fkuehnel · March 11, 2012 22:30 · fkuehnel · Mar 11, 2012
diff --git a/ExtractMath.java b/ExtractMath.java
 package org.apache.pdfbox;

 import org.apache.pdfbox.exceptions.InvalidPasswordException;

 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.util.PDFTextStripperByArea;

 import java.awt.geom.Rectangle2D;

 import java.util.List;

 /**
  * Command-line tool that walks every page of a PDF and, for each annotation
  * whose subtype is {@code "MathML"}, prints the annotation's contents together
  * with the page text found inside the annotation's rectangle.
  *
  * Arguments: {@code <input-pdf> [y-offset] [add-height]}. The two optional
  * floats adjust the text-extraction region derived from each annotation
  * rectangle: {@code y-offset} is added to the region's y coordinate and
  * {@code add-height} is added to its height. Both default to 0.
  */
 public class ExtractMath
 {
    // ExtractMath is a namespace
    private ExtractMath() {}

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the PDF to read; optional {@code args[1]}
     *             and {@code args[2]} are the y offset and the extra height
     *             (parsed with {@link Float#parseFloat(String)}).
     * @throws Exception if loading, decrypting, text extraction or closing the
     *                   document fails, or if an optional argument is not a
     *                   valid float.
     */
    public static void main(String[] args) throws Exception
    {
        int argc = args.length;
        if (argc < 1) {
            usage();
            return;
        }

        PDDocument document = null;
        float yOffset   = 0.0f;
        float addHeight = 0.0f;
        try {
            document = PDDocument.load(args[0]);
            if (argc > 1) {
                yOffset = Float.parseFloat(args[1]);
                if (argc > 2) {
                    addHeight = Float.parseFloat(args[2]);
                }
            }

            if (document.isEncrypted()) {
                try {
                    // Try the empty password; anything else is reported as an error.
                    document.decrypt( "" );
                } catch( InvalidPasswordException e ) {
                    System.err.println( "Error: Document is encrypted with a password." );
                    // System.exit does not return, so the finally block below
                    // is not executed on this path.
                    System.exit( 1 );
                }
            }

            List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
            int pageIdx = 0;
            for (PDPage page : allPages) {
                // A fresh stripper per page, so region names can restart at "formula0".
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition(true);

                PDRectangle cropBox = page.findCropBox();
                System.out.println("Crop box for page " + pageIdx + ": "+ cropBox);
                PDRectangle mediaBox = page.findMediaBox();
                System.out.println("Media box for this page: " + pageIdx + ": " + mediaBox);

                // First pass: register one named region per MathML annotation.
                List<PDAnnotation> pageAnnotations = page.getAnnotations();
                int formulaIdx = 0;
                for (PDAnnotation annot : pageAnnotations) {
                    if ("MathML".equals(annot.getSubtype())) {
                        PDRectangle pdRect = annot.getRectangle();
                        float h = pdRect.getHeight() + addHeight;
                        float w = pdRect.getWidth();
                        // Measure y downwards from the media box height instead of
                        // upwards from the lower edge, then apply the user offset.
                        float y = mediaBox.getHeight() - pdRect.getLowerLeftY() - h + yOffset;
                        Rectangle2D rect = new Rectangle2D.Float();
                        rect.setRect(pdRect.getLowerLeftX()-0.01f, y + 0.01f, w, h);
                        System.out.println("define region: " + rect);
                        stripper.addRegion("formula"+formulaIdx, rect);
                        formulaIdx += 1;
                    }
                }

                stripper.extractRegions(page);

                // Second pass: same filter and same order as the first pass, so
                // the running index reproduces the region names registered above.
                formulaIdx = 0;
                for (PDAnnotation annot : pageAnnotations) {
                    if ("MathML".equals(annot.getSubtype())) {
                        System.out.println("MathML formula:");
                        System.out.println(annot.getContents());
                        PDRectangle pdRect = annot.getRectangle();
                        System.out.println("Text in the area: " + pdRect);
                        System.out.println(stripper.getTextForRegion("formula"+formulaIdx));
                        formulaIdx += 1;
                    }
                }

                // Advance the counter so each page is reported under its own
                // index (it was previously left at 0 for every page).
                pageIdx += 1;
            }
        }
        finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
      * This will print the usage for this program, including the two
      * optional numeric arguments accepted by {@code main}.
      */
    private static void usage()
    {
        System.err.println( "Usage: java " + ExtractMath.class.getName()
            + " <input-pdf> [y-offset] [add-height]" );
    }

 }
	package org.apache.pdfbox;

	import org.apache.pdfbox.exceptions.InvalidPasswordException;

	import org.apache.pdfbox.pdmodel.PDDocument;
	import org.apache.pdfbox.pdmodel.PDPage;
	import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
	import org.apache.pdfbox.pdmodel.common.PDRectangle;
	import org.apache.pdfbox.util.PDFTextStripperByArea;

	import java.awt.geom.Rectangle2D;

	import java.util.List;

	/**
	 * Command-line tool that walks every page of a PDF and, for each annotation
	 * whose subtype is {@code "MathML"}, prints the annotation's contents together
	 * with the page text found inside the annotation's rectangle.
	 *
	 * Arguments: {@code <input-pdf> [y-offset] [add-height]}. The two optional
	 * floats adjust the text-extraction region derived from each annotation
	 * rectangle: {@code y-offset} is added to the region's y coordinate and
	 * {@code add-height} is added to its height. Both default to 0.
	 */
	public class ExtractMath
	{
	    // ExtractMath is a namespace
	    private ExtractMath() {}

	    /**
	     * Program entry point.
	     *
	     * @param args {@code args[0]} is the PDF to read; optional {@code args[1]}
	     *             and {@code args[2]} are the y offset and the extra height
	     *             (parsed with {@link Float#parseFloat(String)}).
	     * @throws Exception if loading, decrypting, text extraction or closing the
	     *                   document fails, or if an optional argument is not a
	     *                   valid float.
	     */
	    public static void main(String[] args) throws Exception
	    {
	        int argc = args.length;
	        if (argc < 1) {
	            usage();
	            return;
	        }

	        PDDocument document = null;
	        float yOffset = 0.0f;
	        float addHeight = 0.0f;
	        try {
	            document = PDDocument.load(args[0]);
	            if (argc > 1) {
	                yOffset = Float.parseFloat(args[1]);
	                if (argc > 2) {
	                    addHeight = Float.parseFloat(args[2]);
	                }
	            }

	            if (document.isEncrypted()) {
	                try {
	                    // Try the empty password; anything else is reported as an error.
	                    document.decrypt( "" );
	                } catch( InvalidPasswordException e ) {
	                    System.err.println( "Error: Document is encrypted with a password." );
	                    // System.exit does not return, so the finally block below
	                    // is not executed on this path.
	                    System.exit( 1 );
	                }
	            }

	            List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
	            int pageIdx = 0;
	            for (PDPage page : allPages) {
	                // A fresh stripper per page, so region names can restart at "formula0".
	                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
	                stripper.setSortByPosition(true);

	                PDRectangle cropBox = page.findCropBox();
	                System.out.println("Crop box for page " + pageIdx + ": "+ cropBox);
	                PDRectangle mediaBox = page.findMediaBox();
	                System.out.println("Media box for this page: " + pageIdx + ": " + mediaBox);

	                // First pass: register one named region per MathML annotation.
	                List<PDAnnotation> pageAnnotations = page.getAnnotations();
	                int formulaIdx = 0;
	                for (PDAnnotation annot : pageAnnotations) {
	                    if ("MathML".equals(annot.getSubtype())) {
	                        PDRectangle pdRect = annot.getRectangle();
	                        float h = pdRect.getHeight() + addHeight;
	                        float w = pdRect.getWidth();
	                        // Measure y downwards from the media box height instead of
	                        // upwards from the lower edge, then apply the user offset.
	                        float y = mediaBox.getHeight() - pdRect.getLowerLeftY() - h + yOffset;
	                        Rectangle2D rect = new Rectangle2D.Float();
	                        rect.setRect(pdRect.getLowerLeftX()-0.01f, y + 0.01f, w, h);
	                        System.out.println("define region: " + rect);
	                        stripper.addRegion("formula"+formulaIdx, rect);
	                        formulaIdx += 1;
	                    }
	                }

	                stripper.extractRegions(page);

	                // Second pass: same filter and same order as the first pass, so
	                // the running index reproduces the region names registered above.
	                formulaIdx = 0;
	                for (PDAnnotation annot : pageAnnotations) {
	                    if ("MathML".equals(annot.getSubtype())) {
	                        System.out.println("MathML formula:");
	                        System.out.println(annot.getContents());
	                        PDRectangle pdRect = annot.getRectangle();
	                        System.out.println("Text in the area: " + pdRect);
	                        System.out.println(stripper.getTextForRegion("formula"+formulaIdx));
	                        formulaIdx += 1;
	                    }
	                }

	                // Advance the counter so each page is reported under its own
	                // index (it was previously left at 0 for every page).
	                pageIdx += 1;
	            }
	        }
	        finally {
	            if (document != null) {
	                document.close();
	            }
	        }
	    }

	    /**
	     * This will print the usage for this program, including the two
	     * optional numeric arguments accepted by {@code main}.
	     */
	    private static void usage()
	    {
	        System.err.println( "Usage: java " + ExtractMath.class.getName()
	            + " <input-pdf> [y-offset] [add-height]" );
	    }

	}