swharden · January 31, 2023 17:20
diff --git a/pptx-to-html.py b/pptx-to-html.py
 """
 This script finds text in a folder of PowerPoint (.pptx) files and saves
 what is found in an HTML report that can be easily searched. It separates
 long phrases from stray words to make important content easier to spot.
 """

 import datetime
 import collections
 import collections.abc

 import pptx  # pip install python-pptx
 import pathlib


 def getLines(pptxFile: pathlib.Path, minCharCount: int = 3):
    """
    Collect the text runs found in a presentation file.

    The file is opened in binary mode and handed to python-pptx. Every
    shape on every slide that has a text frame is visited, and the
    whitespace-stripped text of each run in each paragraph is considered.

    pptxFile: path of the presentation to read.
    minCharCount: runs whose stripped text is shorter than this are dropped.

    Returns a list of stripped run texts, in slide/shape/paragraph/run order.

    NOTE(review): only shapes yielded by ``slide.shapes`` are inspected;
    text inside grouped shapes or table cells is presumably not reached.
    Confirm against the python-pptx documentation.
    """
    def strippedRunTexts(presentation):
        # Walk slide -> shape -> paragraph -> run, skipping shapes
        # that carry no text frame.
        for slide in presentation.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        for run in paragraph.runs:
                            yield run.text.strip()

    with open(pptxFile, 'rb') as stream:
        candidates = strippedRunTexts(pptx.Presentation(stream))
        # Materialize the list while the file is still open.
        return [text for text in candidates if len(text) >= minCharCount]


 def getLinesByFile(folder: pathlib.Path):
    """
    Extract text from every ``*.pptx`` file directly inside a folder.

    folder: directory to scan (not searched recursively).

    Returns a dict that maps each file's path (as a string) to the list
    produced by ``getLines`` for that file. A progress message is printed
    before each file is processed.
    """
    def analyze(path):
        # Announce the file first so a failure is easy to attribute.
        print(f"Analyzing {path.name}")
        return getLines(path)

    return {str(path): analyze(path) for path in folder.glob("*.pptx")}


 def makeReport(linesByFile: dict, saveAs: pathlib.Path, minWordCount: int = 3):
    """
    Write a searchable HTML report of the text found in each file.

    For every entry in linesByFile the report gets a heading with the file
    name followed by a bullet list: one bullet per "phrase" (a line with at
    least minWordCount whitespace-separated words) and a final "Words:"
    bullet holding the remaining shorter lines, comma-separated.

    linesByFile: maps a file path string to the list of text lines found.
    saveAs: path of the HTML file to create (overwritten if it exists).
    minWordCount: minimum number of words for a line to count as a phrase.

    All text taken from the input is HTML-escaped, and the file is written
    as UTF-8 (declared in the document head) so that slide text containing
    markup characters or non-ASCII characters is reported intact.
    """
    # Local import: keeps this function self-contained without touching
    # the file's import block.
    from html import escape

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        "<html><head><meta charset='utf-8'></head><body>",
        "<div style='text-align: center; margin: 3em 0;'>",
        "<h1>Report</h1>",
        "<div><i>This file facilitates searching for text across powerpoint files</i></div>",
        f"<div style='margin: 1em 0;'><code>generated {datetime.datetime.now()}</code></div>",
        "</div>",
    ]

    for key, lines in linesByFile.items():
        filename = pathlib.Path(key).name
        parts.append(f"<h3>{escape(filename)}</h3>")
        parts.append("<ul>")
        # split() with no argument treats any run of whitespace as a single
        # separator, so double spaces or tabs do not inflate the word count.
        phrases = [x for x in lines if len(x.split()) >= minWordCount]
        words = [x for x in lines if len(x.split()) < minWordCount]
        parts.extend(f"<li>{escape(line)}</li>" for line in phrases)
        parts.append(f"<li>Words: {escape(', '.join(words))}</li>")
        parts.append("</ul>")

    parts.append("</body></html>")

    # Explicit encoding: the platform default (e.g. cp1252 on Windows)
    # cannot represent all characters that may appear in slide text.
    with open(saveAs, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"Saved: {saveAs}")


 if __name__ == "__main__":
    # Script entry point: scan one folder of .pptx files and write
    # "report.html" into that same folder.
    import sys  # local import: only the entry point needs argv

    # Use the first command-line argument as the folder when given;
    # otherwise fall back to the placeholder path used previously.
    folderPath = pathlib.Path(
        sys.argv[1] if len(sys.argv) > 1 else R"C:\path\to\folder")
    if not folderPath.is_dir():
        # Fail early with a clear message instead of an error when the
        # report file is opened inside a missing folder.
        raise SystemExit(f"Folder not found: {folderPath}")
    linesByFile = getLinesByFile(folderPath)
    reportFilePath = folderPath.joinpath("report.html")
    makeReport(linesByFile, reportFilePath)
    print("DONE")
	"""
	This script finds text in a folder of PowerPoint (.pptx) files and saves
	what is found in an HTML report that can be easily searched. It separates
	long phrases from stray words to make important content easier to spot.
	"""

	import datetime
	import collections
	import collections.abc

	import pptx # pip install python-pptx
	import pathlib


	def getLines(pptxFile: pathlib.Path, minCharCount: int = 3):
	    """
	    Return the text runs found in a presentation file.

	    Each shape with a text frame on each slide is visited; the text of
	    every run in every paragraph is stripped of surrounding whitespace
	    and kept when it has at least minCharCount characters.

	    pptxFile: path of the presentation to read (opened in binary mode).
	    minCharCount: minimum stripped length for a run to be kept.

	    Returns the kept texts as a list, in the order they were visited.
	    """
	    lines = []
	    with open(pptxFile, 'rb') as f:
	        pres = pptx.Presentation(f)
	        for slide in pres.slides:
	            for shape in slide.shapes:
	                # Shapes without a text frame have nothing to read.
	                if not shape.has_text_frame:
	                    continue
	                for paragraph in shape.text_frame.paragraphs:
	                    for run in paragraph.runs:
	                        text = run.text.strip()
	                        if len(text) >= minCharCount:
	                            lines.append(text)
	    return lines


	def getLinesByFile(folder: pathlib.Path):
	    """
	    Run ``getLines`` on every ``*.pptx`` file directly inside folder.

	    folder: directory to scan (not searched recursively).

	    Returns a dict keyed by the file path (as a string) whose values are
	    the lists returned by ``getLines``. Prints the name of each file
	    before it is processed.
	    """
	    linesByFile = {}
	    for pptxFile in folder.glob("*.pptx"):
	        print(f"Analyzing {pptxFile.name}")
	        linesByFile[str(pptxFile)] = getLines(pptxFile)
	    return linesByFile


	def makeReport(linesByFile: dict, saveAs: pathlib.Path, minWordCount: int = 3):
	    """
	    Write a searchable HTML report of the text found in each file.

	    Each entry of linesByFile produces a heading with the file name and
	    a bullet list: one bullet per line that has at least minWordCount
	    whitespace-separated words, then a "Words:" bullet listing the
	    shorter lines separated by commas.

	    linesByFile: maps a file path string to the list of text lines found.
	    saveAs: path of the HTML file to create (overwritten if it exists).
	    minWordCount: minimum number of words for a line to count as a phrase.

	    Input text is HTML-escaped and the file is written as UTF-8, with the
	    charset declared in the document head.
	    """
	    # Local import: keeps this function self-contained.
	    from html import escape

	    # Fragments are joined once at the end rather than concatenated.
	    parts = [
	        "<html><head><meta charset='utf-8'></head><body>",
	        "<div style='text-align: center; margin: 3em 0;'>",
	        "<h1>Report</h1>",
	        "<div><i>This file facilitates searching for text across powerpoint files</i></div>",
	        f"<div style='margin: 1em 0;'><code>generated {datetime.datetime.now()}</code></div>",
	        "</div>",
	    ]

	    for key, lines in linesByFile.items():
	        filename = pathlib.Path(key).name
	        parts.append(f"<h3>{escape(filename)}</h3>")
	        parts.append("<ul>")
	        # split() collapses runs of whitespace, so repeated spaces do
	        # not count as extra words.
	        phrases = [x for x in lines if len(x.split()) >= minWordCount]
	        words = [x for x in lines if len(x.split()) < minWordCount]
	        parts.extend(f"<li>{escape(line)}</li>" for line in phrases)
	        parts.append(f"<li>Words: {escape(', '.join(words))}</li>")
	        parts.append("</ul>")

	    parts.append("</body></html>")

	    # Explicit UTF-8 so non-ASCII slide text never depends on the
	    # platform's default encoding.
	    with open(saveAs, 'w', encoding='utf-8') as f:
	        f.write("".join(parts))
	    print(f"Saved: {saveAs}")


	if __name__ == "__main__":
	    # Script entry point: scan one folder of .pptx files and write
	    # "report.html" into that same folder.
	    import sys  # local import: only the entry point needs argv

	    # Use the first command-line argument as the folder when given;
	    # otherwise fall back to the placeholder path used previously.
	    folderPath = pathlib.Path(
	        sys.argv[1] if len(sys.argv) > 1 else R"C:\path\to\folder")
	    if not folderPath.is_dir():
	        # Fail early with a clear message when the folder is missing.
	        raise SystemExit(f"Folder not found: {folderPath}")
	    linesByFile = getLinesByFile(folderPath)
	    reportFilePath = folderPath.joinpath("report.html")
	    makeReport(linesByFile, reportFilePath)
	    print("DONE")