Created
March 11, 2024 11:22
-
-
Save AnthonyZJiang/a99391f383ec045bdac9a50f75229469 to your computer and use it in GitHub Desktop.
Python script to extract PDF files embedded in Microsoft Word *.docx files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import olefile
from zipfile import ZipFile
from glob import glob

# Script: scan every .docx file in the current folder, look inside its
# word/embeddings/ entries for OLE containers tagged as Adobe Acrobat
# documents, and write each embedded PDF out as "Document <n>.pdf" in the
# current folder. Prints one line per .docx that could not be processed and
# a final count of extracted PDFs.

# CLSID for Adobe Acrobat Document
ACROBAT_CLSID = "B801CA65-A1FC-11D0-85AD-444553540000"

# How many PDF documents have we saved
pdf_count = 0

# Loop through all the .docx files in the current folder
for filename in glob("*.docx"):
    try:
        # Try to open the document as ZIP file
        with ZipFile(filename, "r") as archive:
            # Find files in the word/embeddings folder of the ZIP file
            for entry in archive.infolist():
                if not entry.filename.startswith("word/embeddings/"):
                    continue

                # Try to open the embedded OLE file
                with archive.open(entry.filename) as f:
                    if not olefile.isOleFile(f):
                        continue

                    ole = olefile.OleFileIO(f)
                    try:
                        # Skip OLE objects that are not Acrobat documents
                        if ole.root.clsid != ACROBAT_CLSID:
                            continue
                        if not ole.exists("CONTENTS"):
                            continue

                        # Extract the PDF from the OLE file
                        pdf_data = ole.openstream("CONTENTS").read()
                    finally:
                        # Release the OLE reader on every path, including
                        # the early `continue`s above.
                        ole.close()

                # Does the embedded file have a %PDF- header?
                if pdf_data.startswith(b"%PDF-"):
                    pdf_count += 1
                    pdf_filename = "Document %d.pdf" % pdf_count

                    # Save the PDF
                    with open(pdf_filename, "wb") as output_file:
                        output_file.write(pdf_data)
    except Exception:
        # Best effort: report the failing document and move on to the next
        # one. `Exception` (not a bare except) lets Ctrl-C / SystemExit
        # still stop the script.
        print("Unable to open '%s'" % filename)

print("Extracted %d PDF documents" % pdf_count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment