iText pdf library
Website search

Extracting objects from a PDF

Example written in answer to the question "How to extract embedded streams?"

ExtractStreams.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2019 iText Group NV
    Authors: iText Software.
 
    For more information, please contact iText Software at this address:
    sales@itextpdf.com
 */
/**
 * Example written by Bruno Lowagie in answer to the following question:
 * http://stackoverflow.com/questions/30286601/extracting-an-embedded-object-from-a-pdf
 */
package com.itextpdf.samples.sandbox.parse;
 
import com.itextpdf.kernel.PdfException;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfStream;
import com.itextpdf.test.annotations.type.SampleTest;
 
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
 
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
 
@Category(SampleTest.class)
public class ExtractStreams {
    public static final String DEST = "./target/test/resources/sandbox/parse/extract_streams%s";
    public static final String SRC = "./src/test/resources/pdfs/image.pdf";
 
    @BeforeClass
    public static void before() {
        new File(DEST).getParentFile().mkdirs();
    }
 
    public static void main(String[] args) throws IOException {
        before();
        new ExtractStreams().manipulatePdf();
    }
 
    @Test
    public void manipulatePdf() throws IOException {
        PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
        PdfObject obj;
        List<Integer> streamLengths = new ArrayList<>();
        for (int i = 1; i <= pdfDoc.getNumberOfPdfObjects(); i++) {
            obj = pdfDoc.getPdfObject(i);
            if (obj != null && obj.isStream()) {
                byte[] b;
                try {
                    b = ((PdfStream) obj).getBytes();
                } catch (PdfException exc) {
                    b = ((PdfStream) obj).getBytes(false);
                }
                System.out.println(b.length);
                FileOutputStream fos = new FileOutputStream(String.format(DEST, i));
                fos.write(b);
 
                streamLengths.add(b.length);
                fos.close();
            }
        }
        Assert.assertArrayEquals(new Integer[]{30965, 74}, streamLengths.toArray(new Integer[streamLengths.size()]));
        pdfDoc.close();
    }
}

Resources

ExtractStreams.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2019 iText Group NV
Authors: iText Software.
 
For more information, please contact iText Software at this address:
sales@itextpdf.com
*/
 
using System;
using System.IO;
using iText.Kernel;
using iText.Kernel.Pdf;
 
namespace iText.Samples.Sandbox.Parse
{
    public class ExtractStreams
    {
        public static readonly String DEST = "../../results/sandbox/parse";
 
        public static readonly String SRC = "../../resources/pdfs/image.pdf";
 
        public static void Main(String[] args)
        {
            Directory.CreateDirectory(DEST);
 
            new ExtractStreams().ManipulatePdf(DEST);
        }
 
        protected void ManipulatePdf(String dest)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
 
            int numberOfPdfObject = pdfDoc.GetNumberOfPdfObjects();
            for (int i = 1; i <= numberOfPdfObject; i++)
            {
                PdfObject obj = pdfDoc.GetPdfObject(i);
                if (obj != null && obj.IsStream())
                {
                    byte[] b;
                    try
                    {
                        
                        // Get decoded stream bytes.
                        b = ((PdfStream) obj).GetBytes();
                    }
                    catch (PdfException)
                    {
                        
                        // Get originally encoded stream bytes
                        b = ((PdfStream) obj).GetBytes(false);
                    }
 
                    using (FileStream fos = new FileStream(String.Format(dest + "/extract_streams{0}.dat", i), FileMode.Create))
                    {
                        fos.Write(b, 0, b.Length);
                    }
                }
            }
 
            pdfDoc.Close();
        }
    }
}

Click the following link to see the legacy example for iText 5. Except for security fixes, iText 5 is no longer being developed.



Contact

Still have questions? 

We're happy to answer your questions. Reach out to us and we'll get back to you shortly.

Contact us
Stay updated

Join 11,000+ subscribers and become an iText PDF expert by staying up to date with our new products, updates, tips, technical solutions and happenings.

Subscribe Now