Skip to content

Instantly share code, notes, and snippets.

@sudoaza
Created August 26, 2024 09:27
Show Gist options
  • Save sudoaza/62e8b8b9e4cb3fe505582b61db9eee62 to your computer and use it in GitHub Desktop.
Save sudoaza/62e8b8b9e4cb3fe505582b61db9eee62 to your computer and use it in GitHub Desktop.
Quick-and-dirty script to parse a PDF file and extract its objects/images
import re
import zlib
import argparse
from PIL import Image
def parse_args(argv=None):
    """Parse the command-line arguments of the script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, which is what the
            command-line entry point relies on.

    Returns:
        An ``argparse.Namespace`` whose ``pdf_file`` attribute holds the path
        of the PDF file to analyze.
    """
    parser = argparse.ArgumentParser(
        description='Parse a PDF file and extract its objects/images')
    parser.add_argument('pdf_file', help='The PDF file to analyze')
    return parser.parse_args(argv)
_OBJ_HEADER_RX = re.compile(rb'(\d+) (\d+) obj\b')
# The ``stream`` keyword on its own line, followed by its end-of-line marker.
_STREAM_KEYWORD_RX = re.compile(rb'[\r\n]stream(?:\r\n|\n|\r)')
# A direct integer /Length; indirect references ("/Length 12 0 R") are rejected.
_DIRECT_LENGTH_RX = re.compile(rb'/Length\s+(\d+)(?!\d|\s+\d+\s+R\b)')
_ENDSTREAM_RX = re.compile(rb'[\r\n]*endstream')


def _extract_stream(obj_data, arguments):
    """Return the raw bytes stored between ``stream`` and ``endstream``."""
    keyword = _STREAM_KEYWORD_RX.search(obj_data)
    if keyword is None:
        return b''
    start = keyword.end()

    # Prefer the declared length: it is the only unambiguous way to know where
    # binary data ends. Only trust it when ``endstream`` really follows.
    declared = _DIRECT_LENGTH_RX.search(arguments)
    if declared is not None:
        end = start + int(declared.group(1))
        if _ENDSTREAM_RX.match(obj_data, end):
            return obj_data[start:end]

    # Fallback: scan for the terminator and drop the single end-of-line marker
    # that separates the data from ``endstream``.
    end = obj_data.rfind(b'endstream')
    if end < start:
        # No terminator at all: best effort, keep everything up to ``endobj``.
        end = len(obj_data) - len(b'endobj')
    data = obj_data[start:end]
    for eol in (b'\r\n', b'\n', b'\r'):
        if data.endswith(eol):
            return data[:-len(eol)]
    return data


def get_objects(pdf_data):
    """Collect the indirect objects found in the raw bytes of a PDF file.

    The file is scanned for ``<num> <gen> obj`` headers; each object is
    assumed to run up to the next ``endobj`` keyword and to be laid out as one
    header line, one line of arguments (the dictionary), then either
    ``endobj`` or a ``stream`` section. LF, CRLF and CR line endings are
    accepted.

    Args:
        pdf_data: The whole PDF file as ``bytes``.

    Returns:
        A list of ``(definition, arguments, stream)`` tuples of ``bytes``,
        where ``stream`` is ``b''`` for objects without a stream. Stream data
        is returned as stored in the file (not decompressed).

    Raises:
        ValueError: If the third line of an object is neither ``endobj`` nor
            ``stream`` (for example a dictionary spanning several lines).
    """
    objects = []
    for match in _OBJ_HEADER_RX.finditer(pdf_data):
        obj_idx = match.start()
        # Search in place instead of slicing, which copied the file tail for
        # every object.
        endobj_idx = pdf_data.find(b'endobj', obj_idx)
        if endobj_idx == -1:
            # Truncated file: no later object can be terminated either.
            print('No endobj', match.group(0))
            break
        obj_data = pdf_data[obj_idx:endobj_idx + len(b'endobj')]

        lines = obj_data.splitlines()
        if len(lines) < 3:
            # Header and arguments only (or everything on a single line).
            print('No data', *lines)
            continue
        definition, arguments, body_start = lines[:3]

        if body_start == b'endobj':
            objects.append((definition, arguments, b''))
        elif body_start == b'stream':
            objects.append(
                (definition, arguments, _extract_stream(obj_data, arguments)))
        else:
            # NOTE: multi-line dictionaries end up here; kept as a hard error
            # to preserve the original contract.
            raise ValueError('Unknown object type')
    return objects
# PDF device colour spaces and the Pillow mode used for 8-bit samples.
_PIL_MODE_BY_COLOR_SPACE = {
    b'DeviceRGB': 'RGB',
    b'DeviceGray': 'L',
    b'DeviceCMYK': 'CMYK',
}


def _save_image(definition, arguments, stream):
    """Write the decoded samples of an image object to ``<definition>.bmp``."""
    params = {}
    for key in (b'Width', b'Height', b'BitsPerComponent'):
        found = re.search(rb'/' + key + rb'\s+(\d+)', arguments)
        if found is None:
            print('Missing image parameter', key.decode(), 'in', definition)
            return
        params[key] = int(found.group(1))

    # Only directly named device colour spaces are handled; arrays
    # (Indexed, ICCBased, ...) and indirect references are reported and skipped.
    color_space = re.search(rb'/ColorSpace\s*/(\w+)', arguments)
    mode = None
    if color_space is not None:
        mode = _PIL_MODE_BY_COLOR_SPACE.get(color_space.group(1))
    bits = params[b'BitsPerComponent']
    if mode == 'L' and bits == 1:
        mode = '1'  # bilevel image, rows padded to a byte boundary
    elif mode is None or bits != 8:
        print('Unsupported image format in', definition)
        return

    size = (params[b'Width'], params[b'Height'])
    try:
        image = Image.frombytes(mode, size, stream)
    except ValueError as err:
        # Typically a size mismatch, e.g. when a predictor was applied.
        print('Cannot decode image', definition, err)
        return
    if image.mode == 'CMYK':
        image = image.convert('RGB')  # the BMP writer does not accept CMYK

    # ``definition`` is bytes, so the pattern must be bytes as well.
    filename = re.sub(rb'\W', b'_', definition).decode('ascii') + '.bmp'
    image.save(filename)
    print('Image saved as', filename)


def parse_pdf(pdf_data):
    """Print every object of a PDF file and save the raw images it contains.

    For each object returned by ``get_objects`` the stream is inflated when
    it uses ``/FlateDecode``; objects whose dictionary declares
    ``/Subtype /Image`` are then written to ``<num>_<gen>_obj.bmp`` in the
    current directory. Finally the header, the arguments and the first 50
    bytes of the (possibly decoded) stream are printed.

    A stream that cannot be decoded is reported and left as stored, so one
    bad object does not abort the whole run.

    Args:
        pdf_data: The whole PDF file as ``bytes``.
    """
    for definition, arguments, stream in get_objects(pdf_data):
        decoded = True
        if not stream:
            pass
        elif b'/FlateDecode' in arguments:
            try:
                stream = zlib.decompress(stream)
            except zlib.error as err:
                print('Failed to decompress', definition, err)
                decoded = False
        elif b'/LZWDecode' in arguments:
            # zlib only implements DEFLATE; LZW is a different algorithm, so
            # feeding it to zlib.decompress could never succeed.
            print('LZWDecode is not supported')
            decoded = False
        elif b'/Filter' in arguments:
            print('Unknown compression method')
            decoded = False

        # Match the image subtype exactly: a bare "/Image" test also hits
        # names such as /ImageMask or /ImageB.
        is_image = re.search(rb'/Subtype\s*/Image\b', arguments) is not None
        if decoded and stream and is_image:
            _save_image(definition, arguments, stream)

        print(definition, arguments, stream[:50])
def main():
    """Command-line entry point: read the PDF named on the command line and parse it.

    Raises:
        SystemExit: If the file does not start with the ``%PDF`` signature.
    """
    args = parse_args()
    with open(args.pdf_file, 'rb') as f:
        pdf_data = f.read()
    # Explicit check rather than ``assert``, which is stripped under ``-O``.
    if not pdf_data.startswith(b'%PDF'):
        raise SystemExit('Not a PDF file')
    parse_pdf(pdf_data)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment