Parsing PDFs

25th October 2015
admin-marketing

Switch code for this example

ParseCustom.java
  1. /*
  2.  * Example written by Bruno Lowagie in answer to:
  3.  * http://stackoverflow.com/questions/24506830/can-we-use-text-extraction-strategy-after-applying-location-extraction-strategy
  4.  */
  5.  
  6. package sandbox.parse;
  7.  
  8. import com.itextpdf.text.DocumentException;
  9. import com.itextpdf.text.Rectangle;
  10. import com.itextpdf.text.pdf.PdfReader;
  11. import com.itextpdf.text.pdf.parser.FilteredTextRenderListener;
  12. import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
  13. import com.itextpdf.text.pdf.parser.PdfTextExtractor;
  14. import com.itextpdf.text.pdf.parser.RegionTextRenderFilter;
  15. import com.itextpdf.text.pdf.parser.RenderFilter;
  16. import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
  17. import com.itextpdf.text.pdf.parser.TextRenderInfo;
  18.  
  19. import java.io.IOException;
  20.  
  21. public class ParseCustom {
  22.  
  23.     public static final String SRC = "resources/pdfs/nameddestinations.pdf";
  24.  
  25.     class FontRenderFilter extends RenderFilter {
  26.         public boolean allowText(TextRenderInfo renderInfo) {
  27.             String font = renderInfo.getFont().getPostscriptFontName();
  28.             return font.endsWith("Bold") || font.endsWith("Oblique");
  29.         }
  30.     }
  31.    
  32.     public static void main(String[] args) throws IOException, DocumentException {
  33.         new ParseCustom().parse(SRC);
  34.     }
  35.    
  36.     public void parse(String filename) throws IOException {
  37.         PdfReader reader = new PdfReader(filename);
  38.         Rectangle rect = new Rectangle(36, 750, 559, 806);
  39.         RenderFilter regionFilter = new RegionTextRenderFilter(rect);
  40.         FontRenderFilter fontFilter = new FontRenderFilter();
  41.         TextExtractionStrategy strategy = new FilteredTextRenderListener(
  42.                 new LocationTextExtractionStrategy(), regionFilter, fontFilter);
  43.         System.out.println(PdfTextExtractor.getTextFromPage(reader, 1, strategy));
  44.         reader.close();
  45.     }
  46. }
ParseCzech.java
  1. /*
  2.  * Example written by Bruno Lowagie in answer to:
  3.  * http://stackoverflow.com/questions/26670919/itextsharp-diacritic-chars
  4.  */
  5. package sandbox.parse;
  6.  
  7. import com.itextpdf.text.DocumentException;
  8. import com.itextpdf.text.pdf.PdfReader;
  9. import com.itextpdf.text.pdf.parser.PdfTextExtractor;
  10.  
  11. import java.io.File;
  12. import java.io.FileOutputStream;
  13. import java.io.IOException;
  14.  
  15. /**
  16.  *
  17.  * @author Bruno Lowagie (iText Software)
  18.  */
  19. public class ParseCzech {
  20.    
  21.     public static final String SRC = "resources/pdfs/czech.pdf";
  22.     public static final String DEST = "results/parse/czech.txt";
  23.            
  24.     public static void main(String[] args) throws IOException, DocumentException {
  25.         File file = new File(DEST);
  26.         file.getParentFile().mkdirs();
  27.         new ParseCzech().parse(SRC);
  28.     }
  29.    
  30.    
  31.     public void parse(String filename) throws IOException {
  32.         PdfReader reader = new PdfReader(filename);
  33.         FileOutputStream fos = new FileOutputStream(DEST);
  34.         for (int page = 1; page
Contact

Still have questions? 

We're happy to answer your questions. Reach out to us and we'll get back to you shortly.

Contact us
Stay updated

Join 11,000+ subscribers and become an iText PDF expert by staying up to date with our new products, updates, tips, technical solutions and happenings.

Subscribe Now