iText in Action: Chapter 15: Page content and structure

part4.chapter15.ParsingHelloWorld

If you compile and execute this example, you'll get the following result:

You can download the full source code of ParsingHelloWorld, or read it here:

/*
 * This class is part of the book "iText in Action - 2nd Edition"
 * written by Bruno Lowagie (ISBN: 9781935182610)
 * For more info, go to: http://itextpdf.com/examples/
 * This example only works with the AGPL version of iText.
 */
 
package part4.chapter15;
 
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
 
import part1.chapter01.HelloWorld;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.io.RandomAccessSourceFactory;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PRTokeniser;
import com.itextpdf.text.pdf.PdfContentByte;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfTemplate;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.text.pdf.RandomAccessFileOrArray;
import com.itextpdf.text.pdf.parser.ContentByteUtils;
import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor;
import com.itextpdf.text.pdf.parser.RenderListener;
 
public class ParsingHelloWorld {
 
    /** The resulting PDF. */
    public static final String PDF = "results/part4/chapter15/hello_reverse.pdf";
    /** A possible resulting after parsing the PDF. */
    public static final String TEXT1 = "results/part4/chapter15/result1.txt";
    /** A possible resulting after parsing the PDF. */
    public static final String TEXT2 = "results/part4/chapter15/result2.txt";
    /** A possible resulting after parsing the PDF. */
    public static final String TEXT3 = "results/part4/chapter15/result3.txt";
 
    /**
     * Generates a PDF file with the text 'Hello World'
     * @throws DocumentException 
     * @throws IOException 
     */
    public void createPdf(String filename) throws DocumentException, IOException {
        // step 1
    	Document document = new Document();
        // step 2
    	PdfWriter writer
          = PdfWriter.getInstance(document, new FileOutputStream(filename));
        // step 3
    	document.open();
    	// step 4
        // we add the text to the direct content, but not in the right order
        PdfContentByte cb = writer.getDirectContent();
        BaseFont bf = BaseFont.createFont();
        cb.beginText();
        cb.setFontAndSize(bf, 12);
        cb.moveText(88.66f, 367); 
        cb.showText("ld");
        cb.moveText(-22f, 0); 
        cb.showText("Wor");
        cb.moveText(-15.33f, 0); 
        cb.showText("llo");
        cb.moveText(-15.33f, 0); 
        cb.showText("He");
        cb.endText();
        // we also add text in a form XObject
        PdfTemplate tmp = cb.createTemplate(250, 25);
        tmp.beginText();
        tmp.setFontAndSize(bf, 12);
        tmp.moveText(0, 7);
        tmp.showText("Hello People");
        tmp.endText();
        cb.addTemplate(tmp, 36, 343);
        // step 5
        document.close();
    }
 
    /**
     * Parses the PDF using PRTokeniser
     * @param src  the path to the original PDF file
     * @param dest the path to the resulting text file
     * @throws IOException
     */
    public void parsePdf(String src, String dest) throws IOException {
        PdfReader reader = new PdfReader(src);
        // we can inspect the syntax of the imported page
        byte[] streamBytes = reader.getPageContent(1);
        PRTokeniser tokenizer = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().createSource(streamBytes)));
        PrintWriter out = new PrintWriter(new FileOutputStream(dest));
        while (tokenizer.nextToken()) {
            if (tokenizer.getTokenType() == PRTokeniser.TokenType.STRING) {
                out.println(tokenizer.getStringValue());
            }
        }
        out.flush();
        out.close();
        reader.close();
    }
 
    /**
     * Extracts text from a PDF document.
     * @param src  the original PDF document
     * @param dest the resulting text file
     * @throws IOException
     */
    public void extractText(String src, String dest) throws IOException {
        PrintWriter out = new PrintWriter(new FileOutputStream(dest));
        PdfReader reader = new PdfReader(src);
        RenderListener listener = new MyTextRenderListener(out);
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
        PdfDictionary pageDic = reader.getPageN(1);
        PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
        processor.processContent(ContentByteUtils.getContentBytesForPage(reader, 1), resourcesDic);
        out.flush();
        out.close();
        reader.close();
    }
 
    /**
     * Main method.
     * @param    args    no arguments needed
     * @throws DocumentException 
     * @throws IOException
     */
    public static void main(String[] args) throws DocumentException, IOException {
        ParsingHelloWorld example = new ParsingHelloWorld();
        HelloWorld.main(args);
        example.createPdf(PDF);
        example.parsePdf(HelloWorld.RESULT, TEXT1);
        example.parsePdf(PDF, TEXT2);
        example.extractText(PDF, TEXT3);
    }
}
 
Powered by GeSHi

The ParsingHelloWorld example is part of the book iText in Action (ISBN 9781935182610).

It's a small standalone application. You can use this example for inspiration, but please read the book if there's something you don't understand about the example. You'll find more info in Chapter 15. Note that this example depends on HelloWorld, MyTextRenderListener, and maybe some other examples.

Keywords for this example: Existing PDFs, Hello World, Parsing PDF, PdfReader, PdfTemplate, PRTokeniser, Parsing PDF > extracting text, Parsing PDF > read content stream

If you want this example to work, you need the following jars: iText.jar