Last active
July 6, 2024 13:48
-
-
Save d4rkd3v1l/3b7d1d68f28eff1a8a4eee73934f288d to your computer and use it in GitHub Desktop.
Simple Python script to index PDF files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import shlex
import subprocess
from collections.abc import Iterator
from dataclasses import dataclass
from itertools import groupby

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.oxml import OxmlElement, ns, parse_xml
from docx.oxml.ns import nsdecls, qn
from docx.shared import Inches, Mm, Pt
@dataclass
class Match:
    """Hit count of one search term on a single page of one book."""

    # Book number, parsed as an integer from the name of the matching file.
    book: int
    # Page on which the term was found, after the page offset was subtracted.
    page: int
    # How often the term occurs on that page.
    count: int
@dataclass
class Search:
    """A search term together with every per-page hit found for it."""

    # The term exactly as it was read from the terms file.
    searchTerm: str
    # One entry per (book, page) on which the term was found.
    matches: list[Match]
@dataclass
class Result:
    """Index entry for one search term: its hits grouped by book."""

    @dataclass
    class Book:
        """All page ranges of a single book on which the term occurs."""

        @dataclass
        class SequencedMatch:
            """A run of consecutive pages (``start`` to ``end``, inclusive)."""

            # First page of the run.
            start: int
            # Last page of the run; equal to ``start`` for a single page.
            end: int
            # Sum of the hit counts of all pages in the run.
            count: int

        # Book number shown in front of the page list.
        number: int
        # Page runs belonging to this book.
        matches: list[SequencedMatch]

    # The term this entry was built for.
    searchTerm: str
    # One entry per book that contains the term.
    books: list[Book]
def main(termsFile='terms.txt', pageOffset=2):
    """Search the current directory for every term and write the index.

    Reads one search term per line from *termsFile*, runs ``pdfgrep`` for it
    and parses each output line as ``<book>:<page>:<count>``. The ``.pdf``
    suffix is stripped from the output first, so file names are expected to
    be integers (for example ``1.pdf``). Terms with at least one hit are
    condensed with ``sequenceMatches``; the collected results are finally
    handed to ``createDocx``.

    Args:
        termsFile: Path of the text file holding one search term per line.
        pageOffset: Value subtracted from every reported page number, for
            books whose page numbering doesn't start on the first page.
            Hits whose adjusted page number is not positive are dropped.
    """
    results = []
    with open(termsFile, 'r') as terms:
        for line in terms:
            term = line.rstrip()
            if not term:
                # A blank line would turn into an empty search pattern and,
                # later on, into an index entry without a first letter.
                continue
            print(term)
            search = Search(term, [])
            # shlex.quote hands the term to pdfgrep verbatim, even if it
            # contains quotes, `$` or backticks. The shell itself is still
            # needed to expand `*`; `exit 0` keeps check_output from raising
            # when pdfgrep exits non-zero.
            pdfgrep = "pdfgrep --cache -p -o -n " + shlex.quote(term) + " * 2>/dev/null; exit 0"
            output = subprocess.check_output(pdfgrep, stderr=subprocess.STDOUT, shell=True)
            output = output.decode('UTF-8').replace('.pdf', '')
            for resultLine in output.splitlines():
                resultElements = resultLine.split(':')
                resultPage = int(resultElements[1]) - pageOffset
                if resultPage > 0:
                    match = Match(int(resultElements[0]), resultPage, int(resultElements[2]))
                    search.matches.append(match)
            if search.matches:
                results.append(sequenceMatches(search))
            else:
                print("-------------- No result for \"" + search.searchTerm + "\" --------------")
    createDocx(results)
def sequences(matches: list[Match]) -> Iterator[Result.Book.SequencedMatch]:
    """Merge per-page matches into runs of consecutive pages.

    Args:
        matches: Matches of a single book, ordered by ascending page.

    Yields:
        One ``SequencedMatch`` per run of directly consecutive pages, holding
        the first page, the last page and the summed hit count of the run.
        Nothing is yielded for an empty list.
    """
    if not matches:
        return
    first = matches[0]
    start = end = first.page
    count = first.count
    for match in matches[1:]:
        if match.page != end + 1:
            # Gap in the page numbers: emit the finished run, start a new one.
            yield Result.Book.SequencedMatch(start, end, count)
            start, count = match.page, 0
        end = match.page
        count += match.count
    yield Result.Book.SequencedMatch(start, end, count)
def sequenceMatches(search: Search) -> Result:
    """Condense the raw matches of *search* into per-book page ranges.

    Args:
        search: The search term and its per-page matches.

    Returns:
        A ``Result`` with one ``Book`` per distinct book number, in ascending
        book order, each holding the page runs produced by ``sequences``.
    """
    result = Result(search.searchTerm, [])
    # groupby only merges adjacent items and sequences expects ascending
    # pages, so establish that order here instead of relying on the caller.
    ordered = sorted(search.matches, key=lambda m: (m.book, m.page))
    for bookNumber, bookMatches in groupby(ordered, key=lambda m: m.book):
        book = Result.Book(bookNumber, list(sequences(list(bookMatches))))
        result.books.append(book)
    return result
# https://stackoverflow.com/a/56676220/2019384
def create_element(name):
    """Return a new XML element for the prefixed tag *name*, e.g. ``'w:fldChar'``."""
    element = OxmlElement(name)
    return element
def create_attribute(element, name, value):
    """Set the attribute with the prefixed name *name* on *element* to *value*."""
    qualified_name = ns.qn(name)
    element.set(qualified_name, value)
def add_page_number(run):
    """Append a ``PAGE`` field to *run*.

    The field consists of three child elements added to the run's XML in
    this order: a ``begin`` field character, the instruction text ``PAGE``
    and an ``end`` field character.
    """
    begin_marker = create_element('w:fldChar')
    create_attribute(begin_marker, 'w:fldCharType', 'begin')

    field_code = create_element('w:instrText')
    create_attribute(field_code, 'xml:space', 'preserve')
    field_code.text = "PAGE"

    end_marker = create_element('w:fldChar')
    create_attribute(end_marker, 'w:fldCharType', 'end')

    for node in (begin_marker, field_code, end_marker):
        run._r.append(node)
# https://github.com/python-openxml/python-docx/issues/433#issuecomment-358566765
def modifyBorder(table):
    """Set the top, left, bottom and right border of every cell in *table* to ``nil``."""
    for cell in table._tbl.iter_tcs():
        borders = OxmlElement('w:tcBorders')
        for side in ('w:top', 'w:left', 'w:bottom', 'w:right'):
            edge = OxmlElement(side)
            edge.set(qn('w:val'), 'nil')
            borders.append(edge)
        # The border definitions live in the cell's tcPr (cell properties) element.
        cell.tcPr.append(borders)
def _shadeCells(cells, fill):
    """Give every cell in *cells* the background colour *fill* (hex RGB string)."""
    for cell in cells:
        shading = parse_xml(r'<w:shd {} w:fill="{}"/>'.format(nsdecls('w'), fill))
        cell._tc.get_or_add_tcPr().append(shading)


def _pageLabel(match):
    """Return ``'<start>'`` for a single page or ``'<start>-<end>'`` for a range."""
    if match.start == match.end:
        return str(match.start)
    return str(match.start) + '-' + str(match.end)


def createDocx(results: list[Result]):
    """Render *results* as an alphabetical index and save it as ``index.docx``.

    The document is A4 with a right-aligned page number in the footer.
    Entries are ordered case-insensitively by search term. Whenever the first
    character of the term changes, a new two-column table is started; if that
    character is alphabetic, the table is preceded by a page break and a
    heading showing the letter. Each row holds the term in the first column
    and, per book, ``<book number>: <pages>`` in the second column. The page
    run(s) with the highest hit count of a term are set in bold.

    Args:
        results: Index entries to render. The list itself is not modified.
    """
    document = Document()

    style = document.styles['Normal']
    style.font.name = "Helvetica Neue"
    style.font.size = Pt(9)
    style.paragraph_format.space_before = Pt(2)
    style.paragraph_format.space_after = Pt(2)

    # Make it DIN A4
    section = document.sections[0]
    section.page_height = Mm(297)
    section.page_width = Mm(210)
    section.left_margin = Mm(25.4)
    section.right_margin = Mm(25.4)
    section.top_margin = Mm(25.4)
    section.bottom_margin = Mm(25.4)
    section.header_distance = Mm(12.7)
    section.footer_distance = Mm(12.7)

    document.add_heading('Index', 0)

    footer = section.footer.paragraphs[0]
    footer.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    add_page_number(footer.add_run())

    tables = []
    table = None
    rowIndex = 0
    previousLetter = None
    for result in sorted(results, key=lambda r: r.searchTerm.upper()):
        topMatchCount = max(
            (match.count for book in result.books for match in book.matches),
            default=0,
        )
        # Slicing instead of indexing keeps an empty term from raising.
        currentLetter = result.searchTerm[:1].upper()
        if currentLetter != previousLetter:
            if currentLetter.isalpha():
                document.add_paragraph().add_run().add_break(WD_BREAK.PAGE)
                document.add_heading(currentLetter, 1)
            # Start the table outside the isalpha() check so that terms
            # beginning with a digit or symbol always have a table to go into.
            table = document.add_table(rows=0, cols=2)
            table.style = 'Table Grid'
            tables.append(table)
            rowIndex = 0
            previousLetter = currentLetter

        cells = table.add_row().cells
        # Hack for alternating background colors^^
        _shadeCells(cells, 'FFFFFF' if rowIndex % 2 == 1 else 'F0F0F0')
        rowIndex += 1

        cells[0].text = result.searchTerm
        pagesParagraph = cells[1].paragraphs[0]
        for bookIndex, book in enumerate(result.books):
            if bookIndex:
                # One line per book.
                pagesParagraph.add_run('\n')
            pagesParagraph.add_run(str(book.number) + ': ')
            for matchIndex, match in enumerate(book.matches):
                if matchIndex:
                    pagesParagraph.add_run(', ')
                runner = pagesParagraph.add_run(_pageLabel(match))
                runner.bold = match.count == topMatchCount

    # Hack to remove table borders. Done once per finished table so that each
    # cell receives exactly one border definition.
    for finishedTable in tables:
        modifyBorder(finishedTable)

    document.save('index.docx')
# Only run the indexer when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Put your keywords into "terms.txt" (one per line) and the script will create an index ("index.docx") for all PDF files in the current directory.
There will be a headline for each letter, and a table with all results for that letter.
Result format | <pdf-name: pages>.
A