Created
March 11, 2012 22:30
-
-
Save fkuehnel/2018466 to your computer and use it in GitHub Desktop.
PDFBox extension to extract previously tagged math areas from a PDF document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.apache.pdfbox; | |
import org.apache.pdfbox.exceptions.InvalidPasswordException; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.pdmodel.PDPage; | |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; | |
import org.apache.pdfbox.pdmodel.common.PDRectangle; | |
import org.apache.pdfbox.util.PDFTextStripperByArea; | |
import java.awt.geom.Rectangle2D; | |
import java.util.List; | |
public class ExtractMath | |
{ | |
// ExtractMath is a namespace | |
private ExtractMath() {} | |
public static void main(String[] args) throws Exception | |
{ | |
int argc = args.length; | |
if (argc < 1) { | |
usage(); | |
return; | |
} | |
PDDocument document = null; | |
float yOffset = 0.0f; | |
float addHeight = 0.0f; | |
try { | |
document = PDDocument.load(args[0]); | |
if (argc > 1) { | |
yOffset = Float.parseFloat(args[1]); | |
if (argc > 2) | |
addHeight = Float.parseFloat(args[2]); | |
} | |
if (document.isEncrypted()) { | |
try { | |
document.decrypt( "" ); | |
} catch( InvalidPasswordException e ) { | |
System.err.println( "Error: Document is encrypted with a password." ); | |
System.exit( 1 ); | |
} | |
} | |
List<PDPage> allPages = document.getDocumentCatalog().getAllPages(); | |
int pageIdx = 0; | |
for (PDPage page : allPages) { | |
PDFTextStripperByArea stripper = new PDFTextStripperByArea(); | |
stripper.setSortByPosition(true); | |
PDRectangle cropBox = page.findCropBox(); | |
System.out.println("Crop box for page " + pageIdx + ": "+ cropBox); | |
PDRectangle mediaBox = page.findMediaBox(); | |
System.out.println("Media box for this page: " + pageIdx + ": " + mediaBox); | |
List<PDAnnotation> pageAnnotations = page.getAnnotations(); | |
int formulaIdx = 0; | |
for(PDAnnotation annot: pageAnnotations) { | |
if ("MathML".equals(annot.getSubtype())) { | |
PDRectangle pdRect = annot.getRectangle(); | |
float h = pdRect.getHeight() + addHeight; | |
float w = pdRect.getWidth(); | |
float y = mediaBox.getHeight() - pdRect.getLowerLeftY() - h + yOffset; | |
Rectangle2D rect = new Rectangle2D.Float(); | |
rect.setRect(pdRect.getLowerLeftX()-0.01f, y + 0.01f, w, h); | |
System.out.println("define region: " + rect); | |
stripper.addRegion("formula"+formulaIdx, rect); | |
formulaIdx += 1; | |
} | |
} | |
stripper.extractRegions(page); | |
formulaIdx = 0; | |
for(PDAnnotation annot: pageAnnotations) { | |
if ("MathML".equals(annot.getSubtype())) { | |
System.out.println("MathML formula:"); | |
System.out.println(annot.getContents()); | |
PDRectangle pdRect = annot.getRectangle(); | |
System.out.println("Text in the area: " + pdRect); | |
System.out.println(stripper.getTextForRegion("formula"+formulaIdx)); | |
formulaIdx += 1; | |
} | |
} | |
} | |
} | |
finally { | |
if (document != null) | |
document.close(); | |
} | |
} | |
/** | |
* This will print the usage for this program. | |
*/ | |
private static void usage() | |
{ | |
System.err.println( "Usage: java " + ExtractMath.class.getName() + " <input-pdf>" ); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This file needs to be placed inside the src/main/java/org/apache/pdfbox folder. Additionally, PDFBox.java needs to be modified to include the new ExtractMath command: