Parsing PDFs

This is a code example for iText PDF. Discover more.

31st May 2016
admin-marketing

Switch code for this example

ParseCustom.java
  1. /*
  2.  
  3.     This file is part of the iText (R) project.
  4.     Copyright (c) 1998-2016 iText Group NV
  5.  
  6. */
  7.  
  8. /**
  9.  * Example written by Bruno Lowagie in answer to:
  10.  * http://stackoverflow.com/questions/24506830/can-we-use-text-extraction-strategy-after-applying-location-extraction-strategy
  11.  */
  12. package com.itextpdf.samples.sandbox.parse;
  13.  
  14. import com.itextpdf.kernel.font.PdfFont;
  15. import com.itextpdf.kernel.geom.Rectangle;
  16. import com.itextpdf.kernel.pdf.PdfDocument;
  17. import com.itextpdf.kernel.pdf.PdfReader;
  18. import com.itextpdf.kernel.pdf.canvas.parser.EventType;
  19. import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
  20. import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
  21. import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
  22. import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
  23. import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener;
  24. import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
  25. import com.itextpdf.test.annotations.type.SampleTest;
  26.  
  27. import org.junit.Assert;
  28. import org.junit.BeforeClass;
  29. import org.junit.Test;
  30. import org.junit.experimental.categories.Category;
  31.  
  32. import java.io.File;
  33. import java.io.IOException;
  34.  
  35. @Category(SampleTest.class)
  36. public class ParseCustom {
  37.     public static final String SRC = "./src/test/resources/pdfs/nameddestinations.pdf";
  38.     public static final String EXPECTED_TEXT = "Country List\n" +
  39.             "Internet Movie Database";
  40.  
  41.  
  42.     @BeforeClass
  43.     public static void beforeClass() throws IOException {
  44.         File file = new File(SRC);
  45.         file.getParentFile().mkdirs();
  46.     }
  47.  
  48.     @Test
  49.     public void manipulatePdf() throws IOException {
  50.         PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
  51.         Rectangle rect = new Rectangle(36, 750, 523, 56);
  52.  
  53.         FontFilter fontFilter = new FontFilter(rect);
  54.         FilteredEventListener listener = new FilteredEventListener();
  55.         LocationTextExtractionStrategy extractionStrategy = listener.attachEventListener(new LocationTextExtractionStrategy(), fontFilter);
  56.         new PdfCanvasProcessor(listener).processPageContent(pdfDoc.getFirstPage());
  57.  
  58.         String actualText = extractionStrategy.getResultantText();
  59.         System.out.println(actualText);
  60.  
  61.         pdfDoc.close();
  62.  
  63.         Assert.assertEquals(EXPECTED_TEXT, actualText);
  64.     }
  65.  
  66.  
  67.     class FontFilter extends TextRegionEventFilter {
  68.         public FontFilter(Rectangle filterRect) {
  69.             super(filterRect);
  70.         }
  71.  
  72.         @Override
  73.         public boolean accept(IEventData data, EventType type) {
  74.             if (type.equals(EventType.RENDER_TEXT)) {
  75.                 TextRenderInfo renderInfo = (TextRenderInfo) data;
  76.  
  77.                 PdfFont font = renderInfo.getFont();
  78.                 if (null != font) {
  79.                     String fontName = font.getFontProgram().getFontNames().getFontName();
  80.                     return fontName.endsWith("Bold") || fontName.endsWith("Oblique");
  81.                 }
  82.             }
  83.             return false;
  84.         }
  85.     }
  86. }
ParseCzech.java
  1. /*
  2.  
  3.     This file is part of the iText (R) project.
  4.     Copyright (c) 1998-2016 iText Group NV
  5.  
  6. */
  7.  
  8. /**
  9.  * This example was written by Bruno Lowagie in answer to the following question:
  10.  * https://www.linkedin.com/groups/Script-Change-Author-Name-Comments-159987.S.5984062085800144899
  11.  */
  12. package com.itextpdf.samples.sandbox.parse;
  13.  
  14. import com.itextpdf.kernel.pdf.PdfDocument;
  15. import com.itextpdf.kernel.pdf.PdfReader;
  16. import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
  17. import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
  18. import com.itextpdf.test.annotations.type.SampleTest;
  19.  
  20. import org.junit.Assert;
  21. import org.junit.BeforeClass;
  22. import org.junit.Test;
  23. import org.junit.experimental.categories.Category;
  24.  
  25. import java.io.File;
  26. import java.io.FileOutputStream;
  27. import java.io.IOException;
  28.  
  29. @Category(SampleTest.class)
  30. public class ParseCzech {
  31.     public static final String DEST = "./target/test/resources/sandbox/parse/czech.txt";
  32.     public static final String SRC = "./src/test/resources/pdfs/czech.pdf";
  33.  
  34.     @BeforeClass
  35.     public static void beforeClass() throws IOException {
  36.         File file = new File(DEST);
  37.         file.getParentFile().mkdirs();
  38.         new ParseCzech().manipulatePdf();
  39.     }
  40.  
  41.     @Test
  42.     public void manipulatePdf() throws IOException {
  43.         PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
  44.         FileOutputStream fos = new FileOutputStream(DEST);
  45.  
  46.         LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
  47.  
  48.         PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
  49.         parser.processPageContent(pdfDoc.getFirstPage());
  50.         byte[] array = strategy.getResultantText().getBytes("UTF-8");
  51.         fos.write(array);
  52.  
  53.         fos.flush();
  54.         fos.close();
  55.  
  56.         pdfDoc.close();
  57.  
  58.         Assert.assertEquals(67, array.length);
  59.     }
  60. }
Contact

Still have questions? 

We're happy to answer your questions. Reach out to us and we'll get back to you shortly.

Contact us
Stay updated

Join 11,000+ subscribers and become an iText PDF expert by staying up to date with our new products, updates, tips, technical solutions and happenings.

Subscribe Now