Created
January 31, 2023 17:20
-
-
Save swharden/82d6f36e2cd3ef7e4bd5fa97d883f506 to your computer and use it in GitHub Desktop.
Extract text from a folder of PPTX files and save the output in an HTML report
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
This script finds text in a folder of PPTX files and saves what is found
in an HTML report that can be easily searched. It separates long phrases
from stray words to make important content easier to spot.
"""
import datetime | |
import collections | |
import collections.abc | |
import pptx # pip install python-pptx | |
import pathlib | |
def getLines(pptxFile: pathlib.Path, minCharCount: int = 3):
    """
    Collect the text runs found in one presentation file.

    The file is opened in binary mode and handed to pptx.Presentation.
    Every shape that reports a text frame is visited, and the text of each
    run in each paragraph is stripped of surrounding whitespace.

    Args:
        pptxFile: path of the presentation to read.
        minCharCount: stripped runs shorter than this many characters
            are dropped.

    Returns:
        A list of stripped run strings, in slide / shape / paragraph /
        run order.
    """
    with open(pptxFile, 'rb') as stream:
        presentation = pptx.Presentation(stream)

        # Only shapes that carry a text frame can contribute text.
        textShapes = (
            shape
            for slide in presentation.slides
            for shape in slide.shapes
            if shape.has_text_frame
        )

        # One candidate string per run, whitespace-trimmed.
        candidates = (
            run.text.strip()
            for shape in textShapes
            for paragraph in shape.text_frame.paragraphs
            for run in paragraph.runs
        )

        # The generators are consumed here, while the file is still open.
        return [text for text in candidates if len(text) >= minCharCount]
def getLinesByFile(folder: pathlib.Path):
    """
    Extract text from every PPTX file directly inside a folder.

    Args:
        folder: directory to scan (not recursive) for "*.pptx" files.

    Returns:
        A dict mapping each file's path (as a string) to the list of text
        lines returned by getLines() for that file. Keys are inserted in
        sorted path order. An empty dict is returned when no presentation
        is found.
    """
    linesByFile = {}
    # sorted() gives a stable report order; glob() order depends on the
    # file system.
    for pptxFile in sorted(folder.glob("*.pptx")):
        # Office creates "~$name.pptx" owner/lock files next to a deck that
        # is currently open. They match the glob but are not presentations,
        # so trying to parse one would abort the whole scan.
        if pptxFile.name.startswith("~$"):
            continue
        print(f"Analyzing {pptxFile.name}")
        linesByFile[str(pptxFile)] = getLines(pptxFile)
    return linesByFile
def makeReport(linesByFile: dict, saveAs: pathlib.Path):
    """
    Write an HTML report listing the text found in each presentation.

    For every entry of linesByFile the report shows the file name as a
    heading, then one bullet per "phrase" (a line with at least three
    whitespace-separated words), then a final "Words:" bullet holding the
    remaining short lines joined by commas.

    Args:
        linesByFile: maps a file path string to the list of text lines
            extracted from that file.
        saveAs: path of the HTML file to create or overwrite.

    The report is written as UTF-8 and all extracted text is HTML-escaped,
    so slide text containing "<", "&" or non-ASCII characters is shown
    verbatim instead of breaking the page or the write.
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import block.
    from html import escape

    minWordCount = 3

    parts = [
        # The charset declaration matches the encoding used when writing.
        "<html><head><meta charset='utf-8'></head><body>",
        "<div style='text-align: center; margin: 3em 0;'>",
        "<h1>Report</h1>",
        "<div><i>This file facilitates searching for text across powerpoint files</i></div>",
        f"<div style='margin: 1em 0;'><code>generated {datetime.datetime.now()}</code></div>",
        "</div>",
    ]

    for key, lines in linesByFile.items():
        filename = pathlib.Path(key).name
        parts.append(f"<h3>{escape(filename)}</h3>")
        parts.append("<ul>")

        phrases = []
        words = []
        for line in lines:
            # split() without an argument ignores repeated whitespace, so
            # "a  b" counts as two words rather than three.
            if len(line.split()) >= minWordCount:
                phrases.append(line)
            else:
                words.append(line)

        parts.extend(f"<li>{escape(line)}</li>" for line in phrases)
        parts.append(f"<li>Words: {escape(', '.join(words))}</li>")
        parts.append("</ul>")

    parts.append("</body></html>")

    # An explicit encoding avoids UnicodeEncodeError on platforms whose
    # default text encoding cannot represent the slide text.
    with open(saveAs, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"Saved: {saveAs}")
if __name__ == "__main__":
    # Folder that is scanned for presentations; the report is written
    # into the same folder as "report.html".
    sourceFolder = pathlib.Path(r"C:\path\to\folder")
    reportTarget = sourceFolder / "report.html"
    makeReport(getLinesByFile(sourceFolder), reportTarget)
    print("DONE")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment