d4rkd3v1l · July 6, 2024 13:48 · d4rkd3v1l · Jul 6, 2024
diff --git a/create-index.py b/create-index.py
 import subprocess
 from dataclasses import dataclass
 from itertools import groupby
 from docx import Document
 from docx.shared import Inches, Pt, Mm
 from docx.enum.text import WD_BREAK, WD_ALIGN_PARAGRAPH
 from docx.oxml import OxmlElement, parse_xml, ns
 from docx.oxml.ns import nsdecls, qn

 @dataclass
 class Match:
    """One line of pdfgrep output: the hits of a term on a single page."""

    # Book number: the first output field, i.e. the file name without ".pdf".
    book: int
    # Page number, already corrected by the page-numbering offset.
    page: int
    # Third output field: how often the term was found on that page.
    count: int

 @dataclass
 class Search:
    """A search term together with the raw page hits collected for it."""

    # The term exactly as it is passed to pdfgrep.
    searchTerm: str
    # Page hits in the order in which pdfgrep reported them.
    matches: list[Match]

 @dataclass
 class Result:
    """Index entry for one search term, with its hits grouped per book."""

    @dataclass
    class Book:
        """The page ranges of the term inside one book."""

        @dataclass
        class SequencedMatch:
            """A run of consecutive pages collapsed into one range."""

            start: int  # first page of the range
            end: int  # last page of the range (same as start for one page)
            count: int  # hit counts summed over the pages of the range

        number: int
        matches: list[SequencedMatch]

    searchTerm: str
    books: list[Book]

 def main():
    """Build the index document from the terms listed in ``terms.txt``.

    Every non-blank line of ``terms.txt`` is searched with ``pdfgrep`` in all
    files of the current directory.  Each output line is read as
    ``book:page:count`` (after ``.pdf`` has been removed from the output), the
    page offset is subtracted, and hits that end up on page 0 or below are
    dropped.  Terms with at least one remaining hit are condensed by
    ``sequenceMatches``; the collected results are handed to ``createDocx``.
    Terms without hits are reported on stdout.

    Raises:
        ValueError: if the book, page or count field of an output line is not
            an integer (for example a PDF whose file name is not a number).
        IndexError: if an output line has fewer than three ``:``-separated
            fields.
    """
    import shlex  # function-scope import: only this function builds a shell command

    # Offset, when page numbering doesn't start on the first page
    pageOffset = 2
    results = []

    with open('terms.txt', 'r') as terms:
        for line in terms:
            term = line.rstrip()
            if not term:
                # A blank line is not a search term, and createDocx needs a
                # first letter to file every term under.
                continue

            search = Search(term, [])
            print(term)

            # The shell stays in charge of expanding "*" and of discarding
            # stderr and the exit status; quoting the term keeps quotes, "$"
            # or backticks in it from being interpreted by the shell.
            pdfgrep = "pdfgrep --cache -p -o -n " + shlex.quote(term) + " * 2>/dev/null; exit 0"
            output = subprocess.check_output(pdfgrep, stderr=subprocess.STDOUT, shell=True)
            output = output.decode('UTF-8').replace('.pdf', '')

            for resultLine in output.splitlines():
                resultElements = resultLine.split(':')
                resultPage = int(resultElements[1]) - pageOffset
                if resultPage > 0:
                    search.matches.append(
                        Match(int(resultElements[0]), resultPage, int(resultElements[2]))
                    )

            if search.matches:
                results.append(sequenceMatches(search))
            else:
                print("-------------- No result for \"" + search.searchTerm + "\" --------------")

    createDocx(results)

 def sequences(matches: list[Match]) -> [Result.Book.SequencedMatch]:
    """Yield page ranges built from runs of consecutive pages in *matches*.

    A hit extends the current range when its page is exactly one after the
    previous hit's page; any other page closes the range and opens a new one.
    The count of a range is the sum of the counts of its hits.

    Raises:
        IndexError: when iterated with an empty *matches* list.
    """
    firstPage = matches[0].page
    rangeStart = rangeEnd = firstPage
    expectedPage = firstPage
    total = 0

    for hit in matches:
        if hit.page != expectedPage:
            # Gap in the page sequence: emit the finished range, start afresh.
            yield Result.Book.SequencedMatch(rangeStart, rangeEnd, total)
            rangeStart, total = hit.page, 0
        rangeEnd = hit.page
        total += hit.count
        expectedPage = hit.page + 1

    yield Result.Book.SequencedMatch(rangeStart, rangeEnd, total)

 def sequenceMatches(search: Search) -> Result:
    """Group the hits of *search* per book and collapse them into page ranges.

    Args:
        search: the term and its raw page hits; the list is not modified.

    Returns:
        A ``Result`` with one ``Book`` per distinct book number, in ascending
        book order, each holding the ranges produced by ``sequences``.
    """
    # groupby only merges neighbouring items and sequences() walks the pages
    # in ascending order, so bring the hits into (book, page) order first
    # instead of relying on the order of the pdfgrep output.
    ordered = sorted(search.matches, key=lambda hit: (hit.book, hit.page))

    books = [
        Result.Book(bookNumber, list(sequences(list(hits))))
        for bookNumber, hits in groupby(ordered, key=lambda hit: hit.book)
    ]
    return Result(search.searchTerm, books)

 # https://stackoverflow.com/a/56676220/2019384
 def create_element(name):
    """Return a new XML element for the prefixed tag *name*, e.g. ``'w:fldChar'``."""
    element = OxmlElement(name)
    return element

 def create_attribute(element, name, value):
    """Set the attribute with the prefixed name *name* on *element* to *value*."""
    qualifiedName = ns.qn(name)
    element.set(qualifiedName, value)

 def add_page_number(run):
    """Append a ``PAGE`` field to *run*.

    The field is written as three children of the run's XML element: a
    ``begin`` field character, the instruction text ``PAGE`` and an ``end``
    field character.
    """
    def fieldChar(kind):
        # A w:fldChar marker of the given type ('begin' or 'end').
        marker = create_element('w:fldChar')
        create_attribute(marker, 'w:fldCharType', kind)
        return marker

    instruction = create_element('w:instrText')
    create_attribute(instruction, 'xml:space', 'preserve')
    instruction.text = "PAGE"

    for node in (fieldChar('begin'), instruction, fieldChar('end')):
        run._r.append(node)

 # https://github.com/python-openxml/python-docx/issues/433#issuecomment-358566765
 def modifyBorder(table):
    """Switch off the top, left, bottom and right border of every cell in *table*.

    Each cell gets a ``w:tcBorders`` element whose four sides have
    ``w:val="nil"``.  Calling the function again on the same table replaces
    that element instead of adding another one.
    """
    bordersTag = qn('w:tcBorders')

    for cell in table._tbl.iter_tcs():
        # tcPr holds the cell's border style; create it when the cell has no
        # properties yet instead of failing on None.
        tcPr = cell.get_or_add_tcPr()

        # Drop borders left by an earlier call so they do not pile up.
        for stale in tcPr.findall(bordersTag):
            tcPr.remove(stale)

        tcBorders = OxmlElement('w:tcBorders')
        for side in ('top', 'left', 'bottom', 'right'):
            edge = OxmlElement('w:' + side)
            edge.set(qn('w:val'), 'nil')
            tcBorders.append(edge)
        tcPr.append(tcBorders)

 def createDocx(results: list[Result]):
    """Write the collected results to ``index.docx`` as an alphabetical index.

    The document is set up as A4 with a right-aligned page number in the
    footer.  ``results`` is sorted in place, case-insensitively, by search
    term.  Every new first letter gets a level-1 heading and its own
    two-column table (preceded by a page break when the letter is
    alphabetic); each term is one row with the term on the left and, per
    book, its page ranges on the right.  Ranges whose count equals the highest
    count found for the term are set in bold.

    Args:
        results: index entries to render; entries with an empty search term
            are skipped because they have no first letter to be filed under.
    """
    document = Document()
    style = document.styles['Normal']
    style.font.name = "Helvetica Neue"
    style.font.size = Pt(9)
    style.paragraph_format.space_before = Pt(2)
    style.paragraph_format.space_after = Pt(2)

    # Make it DIN A4
    section = document.sections[0]
    section.page_height = Mm(297)
    section.page_width = Mm(210)
    section.left_margin = Mm(25.4)
    section.right_margin = Mm(25.4)
    section.top_margin = Mm(25.4)
    section.bottom_margin = Mm(25.4)
    section.header_distance = Mm(12.7)
    section.footer_distance = Mm(12.7)

    document.add_heading('Index', 0)
    footer = document.sections[0].footer.paragraphs[0]
    footer.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    add_page_number(footer.add_run())

    results.sort(key=lambda x: x.searchTerm.upper(), reverse=False)
    previousLetter = ''
    tables = []  # one table per first letter, in creation order
    table = None
    rowIndex = 0

    for result in results:
        if not result.searchTerm:
            continue

        # Highest count of any range of this term; those ranges are bolded.
        topMatchCount = max(
            (match.count for book in result.books for match in book.matches),
            default=0,
        )

        currentLetter = result.searchTerm[0].upper()
        if previousLetter != currentLetter:
            if currentLetter.isalpha():
                document.add_paragraph().add_run().add_break(WD_BREAK.PAGE)
            document.add_heading(currentLetter, 1)
            table = document.add_table(rows=0, cols=2)
            table.style = 'Table Grid'
            tables.append(table)
            rowIndex = 0
            previousLetter = currentLetter

        cells = table.add_row().cells

        # Hack for alternating background colors: odd rows white, even rows grey
        fill = "FFFFFF" if rowIndex % 2 == 1 else "F0F0F0"
        for cell in cells:
            shading = parse_xml(r'<w:shd {} w:fill="{}"/>'.format(nsdecls('w'), fill))
            cell._tc.get_or_add_tcPr().append(shading)

        cells[0].text = result.searchTerm
        pages = cells[1].paragraphs[0]

        for bookIndex, book in enumerate(result.books, start=1):
            pages.add_run(str(book.number) + ': ')

            for matchIndex, match in enumerate(book.matches, start=1):
                if match.start == match.end:
                    label = str(match.start)
                else:
                    label = str(match.start) + '-' + str(match.end)
                pages.add_run(label).bold = match.count == topMatchCount

                if matchIndex != len(book.matches):
                    pages.add_run(', ')

            # Books of one term are separated by a line break inside the cell.
            if bookIndex != len(result.books):
                pages.add_run('\n')

        rowIndex += 1

    # Hack to remove table borders.  Done once per finished table: doing it
    # after every row would process all earlier cells of the table again.
    for finished in tables:
        modifyBorder(finished)

    document.save('index.docx')
        
 # Script entry point: runs the indexer as soon as the file is executed
 # (there is no __main__ guard, so importing the file runs it as well).
 main()
	import subprocess
	from dataclasses import dataclass
	from itertools import groupby
	from docx import Document
	from docx.shared import Inches, Pt, Mm
	from docx.enum.text import WD_BREAK, WD_ALIGN_PARAGRAPH
	from docx.oxml import OxmlElement, parse_xml, ns
	from docx.oxml.ns import nsdecls, qn

	@dataclass
	class Match:
	    """One line of pdfgrep output: the hits of a term on a single page."""

	    # Book number: the first output field, i.e. the file name without ".pdf".
	    book: int
	    # Page number, already corrected by the page-numbering offset.
	    page: int
	    # Third output field: how often the term was found on that page.
	    count: int

	@dataclass
	class Search:
	    """A search term together with the raw page hits collected for it."""

	    # The term exactly as it is passed to pdfgrep.
	    searchTerm: str
	    # Page hits in the order in which pdfgrep reported them.
	    matches: list[Match]

	@dataclass
	class Result:
	    """Index entry for one search term, with its hits grouped per book."""

	    @dataclass
	    class Book:
	        """The page ranges of the term inside one book."""

	        @dataclass
	        class SequencedMatch:
	            """A run of consecutive pages collapsed into one range."""

	            start: int  # first page of the range
	            end: int  # last page of the range (same as start for one page)
	            count: int  # hit counts summed over the pages of the range

	        number: int
	        matches: list[SequencedMatch]

	    searchTerm: str
	    books: list[Book]

	def main():
	    """Build the index document from the terms listed in ``terms.txt``.

	    Every non-blank line of ``terms.txt`` is searched with ``pdfgrep`` in all
	    files of the current directory.  Each output line is read as
	    ``book:page:count`` (after ``.pdf`` has been removed from the output), the
	    page offset is subtracted, and hits that end up on page 0 or below are
	    dropped.  Terms with at least one remaining hit are condensed by
	    ``sequenceMatches``; the collected results are handed to ``createDocx``.
	    Terms without hits are reported on stdout.

	    Raises:
	        ValueError: if the book, page or count field of an output line is not
	            an integer (for example a PDF whose file name is not a number).
	        IndexError: if an output line has fewer than three ``:``-separated
	            fields.
	    """
	    import shlex  # function-scope import: only this function builds a shell command

	    # Offset, when page numbering doesn't start on the first page
	    pageOffset = 2
	    results = []

	    with open('terms.txt', 'r') as terms:
	        for line in terms:
	            term = line.rstrip()
	            if not term:
	                # A blank line is not a search term, and createDocx needs a
	                # first letter to file every term under.
	                continue

	            search = Search(term, [])
	            print(term)

	            # The shell stays in charge of expanding "*" and of discarding
	            # stderr and the exit status; quoting the term keeps quotes, "$"
	            # or backticks in it from being interpreted by the shell.
	            pdfgrep = "pdfgrep --cache -p -o -n " + shlex.quote(term) + " * 2>/dev/null; exit 0"
	            output = subprocess.check_output(pdfgrep, stderr=subprocess.STDOUT, shell=True)
	            output = output.decode('UTF-8').replace('.pdf', '')

	            for resultLine in output.splitlines():
	                resultElements = resultLine.split(':')
	                resultPage = int(resultElements[1]) - pageOffset
	                if resultPage > 0:
	                    search.matches.append(
	                        Match(int(resultElements[0]), resultPage, int(resultElements[2]))
	                    )

	            if search.matches:
	                results.append(sequenceMatches(search))
	            else:
	                print("-------------- No result for \"" + search.searchTerm + "\" --------------")

	    createDocx(results)

	def sequences(matches: list[Match]) -> [Result.Book.SequencedMatch]:
	    """Yield page ranges built from runs of consecutive pages in *matches*.

	    A hit extends the current range when its page is exactly one after the
	    previous hit's page; any other page closes the range and opens a new one.
	    The count of a range is the sum of the counts of its hits.

	    Raises:
	        IndexError: when iterated with an empty *matches* list.
	    """
	    firstPage = matches[0].page
	    rangeStart = rangeEnd = firstPage
	    expectedPage = firstPage
	    total = 0

	    for hit in matches:
	        if hit.page != expectedPage:
	            # Gap in the page sequence: emit the finished range, start afresh.
	            yield Result.Book.SequencedMatch(rangeStart, rangeEnd, total)
	            rangeStart, total = hit.page, 0
	        rangeEnd = hit.page
	        total += hit.count
	        expectedPage = hit.page + 1

	    yield Result.Book.SequencedMatch(rangeStart, rangeEnd, total)

	def sequenceMatches(search: Search) -> Result:
	    """Group the hits of *search* per book and collapse them into page ranges.

	    Args:
	        search: the term and its raw page hits; the list is not modified.

	    Returns:
	        A ``Result`` with one ``Book`` per distinct book number, in ascending
	        book order, each holding the ranges produced by ``sequences``.
	    """
	    # groupby only merges neighbouring items and sequences() walks the pages
	    # in ascending order, so bring the hits into (book, page) order first
	    # instead of relying on the order of the pdfgrep output.
	    ordered = sorted(search.matches, key=lambda hit: (hit.book, hit.page))

	    books = [
	        Result.Book(bookNumber, list(sequences(list(hits))))
	        for bookNumber, hits in groupby(ordered, key=lambda hit: hit.book)
	    ]
	    return Result(search.searchTerm, books)

	# https://stackoverflow.com/a/56676220/2019384
	def create_element(name):
	    """Return a new XML element for the prefixed tag *name*, e.g. ``'w:fldChar'``."""
	    return OxmlElement(name)

	def create_attribute(element, name, value):
	    """Set the attribute with the prefixed name *name* on *element* to *value*."""
	    element.set(ns.qn(name), value)

	def add_page_number(run):
	    """Append a ``PAGE`` field to *run*.

	    The field is written as three children of the run's XML element: a
	    ``begin`` field character, the instruction text ``PAGE`` and an ``end``
	    field character.
	    """
	    fldChar1 = create_element('w:fldChar')
	    create_attribute(fldChar1, 'w:fldCharType', 'begin')

	    instrText = create_element('w:instrText')
	    create_attribute(instrText, 'xml:space', 'preserve')
	    instrText.text = "PAGE"

	    fldChar2 = create_element('w:fldChar')
	    create_attribute(fldChar2, 'w:fldCharType', 'end')

	    for node in (fldChar1, instrText, fldChar2):
	        run._r.append(node)

	# https://github.com/python-openxml/python-docx/issues/433#issuecomment-358566765
	def modifyBorder(table):
	    """Switch off the top, left, bottom and right border of every cell in *table*.

	    Each cell gets a ``w:tcBorders`` element whose four sides have
	    ``w:val="nil"``.  Calling the function again on the same table replaces
	    that element instead of adding another one.
	    """
	    bordersTag = qn('w:tcBorders')

	    for cell in table._tbl.iter_tcs():
	        # tcPr holds the cell's border style; create it when the cell has no
	        # properties yet instead of failing on None.
	        tcPr = cell.get_or_add_tcPr()

	        # Drop borders left by an earlier call so they do not pile up.
	        for stale in tcPr.findall(bordersTag):
	            tcPr.remove(stale)

	        tcBorders = OxmlElement('w:tcBorders')
	        for side in ('top', 'left', 'bottom', 'right'):
	            edge = OxmlElement('w:' + side)
	            edge.set(qn('w:val'), 'nil')
	            tcBorders.append(edge)
	        tcPr.append(tcBorders)

	def createDocx(results: list[Result]):
	    """Write the collected results to ``index.docx`` as an alphabetical index.

	    The document is set up as A4 with a right-aligned page number in the
	    footer.  ``results`` is sorted in place, case-insensitively, by search
	    term.  Every new first letter gets a level-1 heading and its own
	    two-column table (preceded by a page break when the letter is
	    alphabetic); each term is one row with the term on the left and, per
	    book, its page ranges on the right.  Ranges whose count equals the highest
	    count found for the term are set in bold.

	    Args:
	        results: index entries to render; entries with an empty search term
	            are skipped because they have no first letter to be filed under.
	    """
	    document = Document()
	    style = document.styles['Normal']
	    style.font.name = "Helvetica Neue"
	    style.font.size = Pt(9)
	    style.paragraph_format.space_before = Pt(2)
	    style.paragraph_format.space_after = Pt(2)

	    # Make it DIN A4
	    section = document.sections[0]
	    section.page_height = Mm(297)
	    section.page_width = Mm(210)
	    section.left_margin = Mm(25.4)
	    section.right_margin = Mm(25.4)
	    section.top_margin = Mm(25.4)
	    section.bottom_margin = Mm(25.4)
	    section.header_distance = Mm(12.7)
	    section.footer_distance = Mm(12.7)

	    document.add_heading('Index', 0)
	    footer = document.sections[0].footer.paragraphs[0]
	    footer.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	    add_page_number(footer.add_run())

	    results.sort(key=lambda x: x.searchTerm.upper(), reverse=False)
	    previousLetter = ''
	    tables = []  # one table per first letter, in creation order
	    table = None
	    rowIndex = 0

	    for result in results:
	        if not result.searchTerm:
	            continue

	        # Highest count of any range of this term; those ranges are bolded.
	        topMatchCount = max(
	            (match.count for book in result.books for match in book.matches),
	            default=0,
	        )

	        currentLetter = result.searchTerm[0].upper()
	        if previousLetter != currentLetter:
	            if currentLetter.isalpha():
	                document.add_paragraph().add_run().add_break(WD_BREAK.PAGE)
	            document.add_heading(currentLetter, 1)
	            table = document.add_table(rows=0, cols=2)
	            table.style = 'Table Grid'
	            tables.append(table)
	            rowIndex = 0
	            previousLetter = currentLetter

	        cells = table.add_row().cells

	        # Hack for alternating background colors: odd rows white, even rows grey
	        fill = "FFFFFF" if rowIndex % 2 == 1 else "F0F0F0"
	        for cell in cells:
	            shading = parse_xml(r'<w:shd {} w:fill="{}"/>'.format(nsdecls('w'), fill))
	            cell._tc.get_or_add_tcPr().append(shading)

	        cells[0].text = result.searchTerm
	        pages = cells[1].paragraphs[0]

	        for bookIndex, book in enumerate(result.books, start=1):
	            pages.add_run(str(book.number) + ': ')

	            for matchIndex, match in enumerate(book.matches, start=1):
	                if match.start == match.end:
	                    label = str(match.start)
	                else:
	                    label = str(match.start) + '-' + str(match.end)
	                pages.add_run(label).bold = match.count == topMatchCount

	                if matchIndex != len(book.matches):
	                    pages.add_run(', ')

	            # Books of one term are separated by a line break inside the cell.
	            if bookIndex != len(result.books):
	                pages.add_run('\n')

	        rowIndex += 1

	    # Hack to remove table borders.  Done once per finished table: doing it
	    # after every row would process all earlier cells of the table again.
	    for finished in tables:
	        modifyBorder(finished)

	    document.save('index.docx')

	# Script entry point: runs the indexer as soon as the file is executed
	# (there is no __main__ guard, so importing the file runs it as well).
	main()