Skip to content

Instantly share code, notes, and snippets.

@d4rkd3v1l
Last active July 6, 2024 13:48
Show Gist options
  • Save d4rkd3v1l/3b7d1d68f28eff1a8a4eee73934f288d to your computer and use it in GitHub Desktop.
Save d4rkd3v1l/3b7d1d68f28eff1a8a4eee73934f288d to your computer and use it in GitHub Desktop.
Simple Python script to index PDF files
import glob
import subprocess
from collections.abc import Iterator
from dataclasses import dataclass
from itertools import groupby

from docx import Document
from docx.shared import Inches, Pt, Mm
from docx.enum.text import WD_BREAK, WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement, parse_xml, ns
from docx.oxml.ns import nsdecls, qn
@dataclass
class Match:
    """One parsed line of pdfgrep output: the hits for a term on a single page."""

    # Book number, parsed as an int from the PDF file name (without ".pdf").
    book: int
    # Page number after the caller has applied its page-numbering offset.
    page: int
    # Third field of the pdfgrep output line (the per-page match count).
    count: int
@dataclass
class Search:
    """A search term together with the raw per-page matches found for it."""

    # The term exactly as it was passed to pdfgrep.
    searchTerm: str
    # Per-page hits, in the order they were parsed from the pdfgrep output.
    matches: list[Match]
@dataclass
class Result:
    """Index entry for one search term, grouped by book and collapsed into page ranges."""

    @dataclass
    class Book:
        """All page ranges of one book in which the term occurs."""

        @dataclass
        class SequencedMatch:
            """A run of consecutive pages that all contain the term."""

            # First page of the run.
            start: int
            # Last page of the run (equal to `start` for a single page).
            end: int
            # Sum of the per-page match counts over the run.
            count: int

        # Book number (same value as Match.book).
        number: int
        # Page ranges found in this book.
        matches: list[SequencedMatch]

    # The term this entry belongs to.
    searchTerm: str
    # One entry per book in which the term was found.
    books: list[Book]
def _runPdfgrep(term: str, files: list[str], pageOffset: int) -> list[Match]:
    """Run pdfgrep for *term* over *files* and parse its `file:page:count` lines."""
    if not files:
        return []

    # Argument list instead of a shell string: the term is passed verbatim, so
    # quotes, `$`, backticks etc. in terms.txt can neither break nor inject
    # into the command. `--` keeps terms/files starting with `-` from being
    # read as options.
    command = ['pdfgrep', '--cache', '-p', '-o', '-n', '--', term, *files]
    # pdfgrep's diagnostics (e.g. for non-PDF files) are discarded and its exit
    # status is ignored: a non-zero status is treated the same as "no output".
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        check=False,
    )

    matches = []
    for line in completed.stdout.decode('UTF-8', errors='replace').splitlines():
        try:
            # Split from the right so a ':' inside a file name cannot shift fields.
            name, page, count = line.rsplit(':', 2)
            book = int(name.removesuffix('.pdf'))
            # Offset, when page numbering doesn't start on the first page
            resultPage = int(page) - pageOffset
            hits = int(count)
        except ValueError:
            # Books are identified by number; anything else cannot be indexed.
            print("Skipping unparsable pdfgrep line: " + line)
            continue
        if resultPage > 0:
            matches.append(Match(book, resultPage, hits))
    return matches


def main(termsPath: str = 'terms.txt', pageOffset: int = 2) -> None:
    """Build the index document for every term listed in *termsPath*.

    Each non-blank line of the terms file is searched with pdfgrep in all
    files of the current directory. Terms with at least one hit are collapsed
    into page ranges via sequenceMatches(); the collected results are then
    handed to createDocx().

    Args:
        termsPath: Text file with one search term per line.
        pageOffset: Number subtracted from every PDF page number; pages that
            end up <= 0 are dropped.

    Raises:
        FileNotFoundError: If *termsPath* does not exist or the pdfgrep
            executable cannot be found.
    """
    # The shell used to expand `*`; do the same expansion here (sorted,
    # dot-files excluded) so no shell is needed.
    files = sorted(glob.glob('*'))

    results = []
    with open(termsPath, 'r') as terms:
        for line in terms:
            term = line.rstrip()
            if not term:
                # A blank line would otherwise become an empty search pattern.
                continue
            print(term)
            search = Search(term, _runPdfgrep(term, files, pageOffset))
            if search.matches:
                results.append(sequenceMatches(search))
            else:
                print("-------------- No result for \"" + search.searchTerm + "\" --------------")
    createDocx(results)
def sequences(matches: list[Match]) -> Iterator[Result.Book.SequencedMatch]:
    """Collapse per-page matches into runs of consecutive pages.

    Args:
        matches: Matches of a single book, ordered by ascending page.

    Yields:
        One SequencedMatch per run of directly consecutive pages, carrying the
        first page, the last page and the summed match count of the run.
        Nothing is yielded for an empty list.
    """
    if not matches:
        return

    start, end, count = matches[0], matches[0], 0
    # Page the next match must be on to extend the current run.
    expectedPage = start.page
    for match in matches:
        if match.page != expectedPage:
            # Gap in the page numbers: emit the finished run and start a new one.
            yield Result.Book.SequencedMatch(start.page, end.page, count)
            start, count = match, 0
            expectedPage = match.page
        end = match
        count += match.count
        expectedPage += 1
    # The last run is still open when the input is exhausted.
    yield Result.Book.SequencedMatch(start.page, end.page, count)
def sequenceMatches(search: Search) -> Result:
    """Group the matches of *search* by book and collapse them into page ranges.

    Args:
        search: The term and its raw per-page matches.

    Returns:
        A Result with one Book per distinct book number, in ascending numeric
        order, each holding the page ranges produced by sequences().
    """
    result = Result(search.searchTerm, [])
    # groupby only merges adjacent items and sequences() expects ascending
    # pages, so establish that order here instead of relying on the order in
    # which the matches happened to be collected. `search.matches` itself is
    # left untouched.
    ordered = sorted(search.matches, key=lambda match: (match.book, match.page))
    for bookNumber, bookMatches in groupby(ordered, key=lambda match: match.book):
        ranges = list(sequences(list(bookMatches)))
        result.books.append(Result.Book(bookNumber, ranges))
    return result
# https://stackoverflow.com/a/56676220/2019384
def create_element(name):
    """Return a new, empty OOXML element for the prefixed tag *name* (e.g. 'w:fldChar')."""
    return OxmlElement(name)
def create_attribute(element, name, value):
    """Set attribute *name* (prefixed, e.g. 'w:fldCharType') on *element* to *value*.

    The prefixed name is resolved to its namespace-qualified form before it is
    set on the element.
    """
    element.set(ns.qn(name), value)
def add_page_number(run):
    """Append a PAGE field to *run*.

    The field is written as three raw elements appended to the run's XML:
    a `begin` field character, the instruction text "PAGE", and an `end`
    field character.

    Args:
        run: The python-docx run whose underlying `_r` element receives the field.
    """
    fieldBegin = create_element('w:fldChar')
    create_attribute(fieldBegin, 'w:fldCharType', 'begin')

    instruction = create_element('w:instrText')
    create_attribute(instruction, 'xml:space', 'preserve')
    instruction.text = "PAGE"

    fieldEnd = create_element('w:fldChar')
    create_attribute(fieldEnd, 'w:fldCharType', 'end')

    # Order matters: begin marker, instruction, end marker.
    for element in (fieldBegin, instruction, fieldEnd):
        run._r.append(element)
# https://github.com/python-openxml/python-docx/issues/433#issuecomment-358566765
def modifyBorder(table):
    """Hide the top, left, bottom and right border of every cell in *table*.

    Each cell gets a `w:tcBorders` element whose four edges are set to 'nil'.
    Calling the function again on the same table replaces that element
    instead of adding a second one.

    Args:
        table: The python-docx table to modify in place.
    """
    bordersTag = qn('w:tcBorders')
    for cell in table._tbl.iter_tcs():
        # A cell is not guaranteed to have cell properties yet; create them on demand.
        tcPr = cell.get_or_add_tcPr()

        # Drop an override left by an earlier call so borders never stack up.
        for stale in tcPr.findall(bordersTag):
            tcPr.remove(stale)

        tcBorders = OxmlElement('w:tcBorders')
        for edge in ('top', 'left', 'bottom', 'right'):
            border = OxmlElement('w:' + edge)
            border.set(qn('w:val'), 'nil')
            tcBorders.append(border)
        tcPr.append(tcBorders)
def _shadeCells(cells, fill: str) -> None:
    """Give each of *cells* the background colour *fill* (hex RGB such as 'F0F0F0')."""
    for cell in cells:
        shading = parse_xml(r'<w:shd {} w:fill="{}"/>'.format(nsdecls('w'), fill))
        cell._tc.get_or_add_tcPr().append(shading)


def createDocx(results: list[Result], outputPath: str = 'index.docx') -> None:
    """Write the index as a Word document.

    The document is A4 with a right-aligned page number in the footer. Terms
    are sorted case-insensitively (the list is sorted in place). Every new
    initial character starts a new two-column table; alphabetic initials
    additionally get a page break and a heading. Each row shows the term on
    the left and, per book, `<book number>: <pages>` on the right, where pages
    are single numbers or `start-end` ranges. The range(s) with the highest
    match count for the term are set in bold.

    Args:
        results: Index entries to render. Entries with an empty search term
            are skipped.
        outputPath: File the document is saved to.
    """
    document = Document()

    style = document.styles['Normal']
    style.font.name = "Helvetica Neue"
    style.font.size = Pt(9)
    style.paragraph_format.space_before = Pt(2)
    style.paragraph_format.space_after = Pt(2)

    # Make it DIN A4
    section = document.sections[0]
    section.page_height = Mm(297)
    section.page_width = Mm(210)
    section.left_margin = Mm(25.4)
    section.right_margin = Mm(25.4)
    section.top_margin = Mm(25.4)
    section.bottom_margin = Mm(25.4)
    section.header_distance = Mm(12.7)
    section.footer_distance = Mm(12.7)

    document.add_heading('Index', 0)

    footer = section.footer.paragraphs[0]
    footer.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    add_page_number(footer.add_run())

    results.sort(key=lambda x: x.searchTerm.upper(), reverse=False)

    tables = []
    table = None
    previousLetter = ''
    rowIndex = 0
    for result in results:
        if not result.searchTerm:
            # No initial character to file the entry under.
            continue

        # Highest match count of any page range of this term, across all books.
        topMatchCount = max(
            (match.count for book in result.books for match in book.matches),
            default=0,
        )

        currentLetter = result.searchTerm[0].upper()
        if table is None or previousLetter != currentLetter:
            # NOTE(review): only alphabetic initials get a page break and a
            # heading; digits/symbols still start a table of their own.
            if currentLetter.isalpha():
                document.add_paragraph().add_run().add_break(WD_BREAK.PAGE)
                document.add_heading(currentLetter, 1)
            table = document.add_table(rows=0, cols=2)
            table.style = 'Table Grid'
            tables.append(table)
            rowIndex = 0
            previousLetter = currentLetter

        cells = table.add_row().cells

        # Hack for alternating background colors^^
        _shadeCells(cells[:2], "FFFFFF" if rowIndex % 2 == 1 else "F0F0F0")

        cells[0].text = result.searchTerm
        paragraph = cells[1].paragraphs[0]
        for bookIndex, book in enumerate(result.books, start=1):
            paragraph.add_run(str(book.number) + ': ')
            for matchIndex, match in enumerate(book.matches, start=1):
                if match.start == match.end:
                    label = str(match.start)
                else:
                    label = str(match.start) + '-' + str(match.end)
                runner = paragraph.add_run(label)
                runner.bold = match.count == topMatchCount
                if matchIndex != len(book.matches):
                    paragraph.add_run(', ')
            if bookIndex != len(result.books):
                # One line per book inside the cell.
                paragraph.add_run('\n')
        rowIndex += 1

    # Hack to remove table borders: done exactly once per finished table, so
    # every table is covered and no cell is processed repeatedly.
    for finishedTable in tables:
        modifyBorder(finishedTable)

    document.save(outputPath)
# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
@d4rkd3v1l
Copy link
Author

d4rkd3v1l commented Jul 6, 2024

Just put your keywords (one per line) into "terms.txt" and the script will create an index ("index.docx") for all PDF files in the current directory.

There will be a headline for each letter, and a table with all results for that letter.
Result format | <pdf-name: pages>.

A

 Android 1: 42, 37
adb 2: 13

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment