Skip to content

Instantly share code, notes, and snippets.

@rileymjohnson
Created March 22, 2023 19:51
Show Gist options
  • Save rileymjohnson/a1175752bac7c84e586d77186e6b3b96 to your computer and use it in GitHub Desktop.
Save rileymjohnson/a1175752bac7c84e586d77186e6b3b96 to your computer and use it in GitHub Desktop.
Extract the letters of a PDF document and their positions
/*
When loading the PDF document, you need to set the `fontExtraProperties` parameter to `true`. e.g.
getDocument({
...,
fontExtraProperties: true
})
*/
/**
 * Builds a pseudo-random identifier string.
 *
 * The identifier is the current timestamp (milliseconds) written in base 36,
 * immediately followed by a random integer of roughly 10^12..10^13, also
 * written in base 36. Randomness comes from Math.random, so the result is
 * not suitable where unpredictability matters.
 *
 * @returns {string} Lower-case alphanumeric identifier.
 */
var generateId = () => {
  const timePart = Date.now().toString(36);
  const randomPart = Math.floor(
    Math.pow(10, 12) + Math.random() * 9 * Math.pow(10, 12)
  ).toString(36);
  return timePart + randomPart;
};
/**
 * Converts a text item's box into fractions of the viewport size.
 *
 * The item's transform is combined with the viewport transform and the point
 * [0, 1] is mapped through the result to obtain the left/top corner. Width
 * and height are obtained by mapping [item.width, 0] and [item.height, 0]
 * through the viewport transform and keeping the first component.
 *
 * NOTE(review): the height is read from the x-component of the viewport
 * transform; this presumably relies on an unrotated, uniformly scaled
 * viewport with no horizontal offset — confirm before using rotated pages.
 *
 * @param {object} item - Text item exposing `transform`, `width`, `height`.
 * @param {object} viewport - Object exposing `transform`, `width`, `height`.
 * @param {number} [heightScale=1] - Multiplier applied to the final height.
 * @returns {{left: number, top: number, width: number, height: number}}
 *   Rectangle expressed relative to the viewport width/height.
 */
function itemToRect(item, viewport, heightScale=1) {
  const viewportMatrix = viewport.transform;
  const itemMatrix = Util.transform(viewportMatrix, item.transform);

  // Corner of the item after both transforms have been applied.
  const corner = Util.applyTransform([0, 1], itemMatrix);

  // Extents only go through the viewport transform, not the item's own.
  const scaledWidth = Util.applyTransform([item.width, 0], viewportMatrix)[0];
  const scaledHeight = Util.applyTransform([item.height, 0], viewportMatrix)[0];

  return {
    left: corner[0] / viewport.width,
    top: corner[1] / viewport.height,
    width: scaledWidth / viewport.width,
    height: scaledHeight / viewport.height * heightScale,
  };
}
/**
 * Yields one rectangle per character of a text item.
 *
 * The item's bounding box (from `itemToRect`) is split horizontally between
 * its characters in proportion to each character's entry in the font's
 * `widths` table. A trailing '\n' is appended as an extra character when
 * `item.hasEOL` is set. All characters share the item's top and height.
 *
 * Relies on a `viewer` object in the enclosing scope that provides
 * `getPageView(index)` (presumably a pdf.js viewer instance — confirm), and
 * on the font object returned by `page.commonObjs.get(item.fontName)`
 * exposing `widths`.
 *
 * @param {object} item - Text item with `str`, `fontName`, optional `hasEOL`.
 * @param {object} page - Page exposing `pageNumber` and `commonObjs.get`.
 * @param {number} heightScale - Forwarded to `itemToRect`.
 * @yields {{letter: string, left: number, width: number, page: number,
 *   top: number, height: number}} Rectangle for one character.
 * @throws {TypeError} If the item has text but its font has no `widths`
 *   table (e.g. the document was loaded without `fontExtraProperties: true`).
 */
function* itemToRects(item, page, heightScale) {
  const { viewport } = viewer.getPageView(page.pageNumber - 1);
  const bbox = {
    ...itemToRect(item, viewport, heightScale),
    page: page.pageNumber,
  };
  const itemFont = page.commonObjs.get(item.fontName);

  let itemText = item.str;
  if (item.hasEOL) {
    itemText += '\n';
  }

  // Spread splits by code point, so surrogate pairs stay one letter.
  const letters = [...itemText];
  if (letters.length === 0) {
    return;
  }

  const widths = itemFont?.widths;
  if (widths == null) {
    // Fail with an actionable message instead of an opaque property error.
    throw new TypeError(
      `Font "${item.fontName}" has no widths table; ` +
      'load the document with `fontExtraProperties: true`.'
    );
  }

  const letterItems = [];
  let itemFontWidth = 0;
  for (const letter of letters) {
    // codePointAt keeps the full code point for characters outside the BMP;
    // charCodeAt would only see the high surrogate and miss the lookup.
    const fontWidth = widths[letter.codePointAt(0)] || 0;
    letterItems.push({ letter, fontWidth, leftOffset: itemFontWidth });
    itemFontWidth += fontWidth;
  }

  if (!(itemFontWidth > 0)) {
    // No usable metric for any character: share the box evenly so letters
    // stay inside the item's box rather than collapsing to the page origin.
    letterItems.forEach((letterItem, index) => {
      letterItem.fontWidth = 1;
      letterItem.leftOffset = index;
    });
    itemFontWidth = letterItems.length;
  }

  for (const { letter, fontWidth, leftOffset } of letterItems) {
    yield {
      letter,
      left: leftOffset / itemFontWidth * bbox.width + bbox.left,
      width: fontWidth / itemFontWidth * bbox.width,
      page: bbox.page,
      top: bbox.top,
      height: bbox.height,
    };
  }
}
/**
 * Collects a positioned entry for every character on a page.
 *
 * Reads the page's text content, expands each text item into per-character
 * rectangles via `itemToRects`, and wraps each one with a generated id.
 *
 * @param {object} page - Page exposing `pageNumber` and an async
 *   `getTextContent()` that resolves to `{ items }`.
 * @param {number} [heightScale=1.25] - Height multiplier forwarded to
 *   `itemToRects`. The default matches the value previously hard-coded here.
 * @returns {Promise<Array<{id: string, text: string,
 *   position: {page: number, rects: object[]}}>>} One entry per character,
 *   in text-content order.
 */
async function getLetterRects(page, heightScale = 1.25) {
  const { items } = await page.getTextContent();
  const formattedLetterRects = [];
  for (const item of items) {
    // Pass the scale positionally: `heightScale=1.25` inside a call is an
    // assignment to an undeclared variable (implicit global, or a
    // ReferenceError in strict mode / ES modules), not a named argument.
    for (const { letter, ...rect } of itemToRects(item, page, heightScale)) {
      formattedLetterRects.push({
        id: generateId(),
        text: letter,
        position: {
          page: page.pageNumber,
          rects: [rect],
        },
      });
    }
  }
  return formattedLetterRects;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment