shentonfreude · July 11, 2019 23:49
diff --git a/pdf2jpg.py b/pdf2jpg.py
 #!/usr/bin/env python3
 # Extract JPEGs from PDFs. Quick and dirty.
 # Adapted from https://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html

 import sys

 # Byte markers and slice adjustments used to locate JPEG data in PDF streams.
 STARTMARK = b"\xff\xd8"  # JPEG start-of-image (SOI) marker
 STARTFIX = 0  # added to the SOI offset to obtain the slice start
 ENDMARK = b"\xff\xd9"  # JPEG end-of-image (EOI) marker
 ENDFIX = 2  # added to the EOI offset so the slice includes the marker itself
 SEARCH_WINDOW = 20  # how many bytes around a stream boundary are scanned


 def iter_jpg_spans(pdf):
     """Yield ``(start, end)`` byte offsets of each JPEG found in *pdf*.

     The scan looks for the keyword ``stream``, expects a JPEG SOI marker
     within the next ``SEARCH_WINDOW`` bytes, and then takes the first EOI
     marker found from ``SEARCH_WINDOW`` bytes before the matching
     ``endstream`` keyword onwards.  ``pdf[start:end]`` is the JPEG data.

     Args:
         pdf: the complete PDF file contents as ``bytes``.

     Raises:
         Exception: if a JPEG stream has no ``endstream`` keyword or no EOI
             marker.  Spans found before the failure have already been
             yielded.
     """
     pos = 0
     while True:
         istream = pdf.find(b"stream", pos)
         if istream < 0:
             return
         istart = pdf.find(STARTMARK, istream, istream + SEARCH_WINDOW)
         if istart < 0:
             # Not a JPEG stream (this also skips the "stream" inside
             # "endstream"); move past it and keep looking.
             pos = istream + SEARCH_WINDOW
             continue
         iend = pdf.find(b"endstream", istart)
         if iend < 0:
             raise Exception("Did not find end of stream!")
         # Clamp the search start: a negative start would make bytes.find()
         # count from the end of the buffer, and the EOI marker can never
         # precede the end of the SOI marker.
         eoi_from = max(istart + len(STARTMARK), iend - SEARCH_WINDOW)
         iend = pdf.find(ENDMARK, eoi_from)
         if iend < 0:
             raise Exception("Did not find end of JPG!")
         istart += STARTFIX
         iend += ENDFIX
         yield istart, iend
         pos = iend


 def main(argv):
     """Write every JPEG found in the PDF named by ``argv[1]`` to jpgN.jpg.

     Files are written to the current directory, numbered from 0 in the
     order the images appear in the PDF.
     """
     if len(argv) < 2:
         sys.exit("usage: pdf2jpg.py FILE.pdf")
     # Context managers guarantee both files are closed, even on error.
     with open(argv[1], "rb") as pdffile:
         pdf = pdffile.read()
     for njpg, (istart, iend) in enumerate(iter_jpg_spans(pdf)):
         print("JPG %d from %d to %d" % (njpg, istart, iend))
         with open("jpg%d.jpg" % njpg, "wb") as jpgfile:
             jpgfile.write(pdf[istart:iend])


 if __name__ == "__main__":
     main(sys.argv)
	#!/usr/bin/env python3
	# Extract JPEGs from PDFs. Quick and dirty.
	# Adapted from https://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html

	import sys

	# Byte markers and slice adjustments used to locate JPEG data in PDF streams.
	STARTMARK = b"\xff\xd8"  # JPEG start-of-image (SOI) marker
	STARTFIX = 0  # added to the SOI offset to obtain the slice start
	ENDMARK = b"\xff\xd9"  # JPEG end-of-image (EOI) marker
	ENDFIX = 2  # added to the EOI offset so the slice includes the marker itself
	SEARCH_WINDOW = 20  # how many bytes around a stream boundary are scanned


	def iter_jpg_spans(pdf):
	    """Yield ``(start, end)`` byte offsets of each JPEG found in *pdf*.

	    The scan looks for the keyword ``stream``, expects a JPEG SOI marker
	    within the next ``SEARCH_WINDOW`` bytes, and then takes the first EOI
	    marker found from ``SEARCH_WINDOW`` bytes before the matching
	    ``endstream`` keyword onwards.  ``pdf[start:end]`` is the JPEG data.

	    Args:
	        pdf: the complete PDF file contents as ``bytes``.

	    Raises:
	        Exception: if a JPEG stream has no ``endstream`` keyword or no EOI
	            marker.  Spans found before the failure have already been
	            yielded.
	    """
	    pos = 0
	    while True:
	        istream = pdf.find(b"stream", pos)
	        if istream < 0:
	            return
	        istart = pdf.find(STARTMARK, istream, istream + SEARCH_WINDOW)
	        if istart < 0:
	            # Not a JPEG stream (this also skips the "stream" inside
	            # "endstream"); move past it and keep looking.
	            pos = istream + SEARCH_WINDOW
	            continue
	        iend = pdf.find(b"endstream", istart)
	        if iend < 0:
	            raise Exception("Did not find end of stream!")
	        # Clamp the search start: a negative start would make bytes.find()
	        # count from the end of the buffer, and the EOI marker can never
	        # precede the end of the SOI marker.
	        eoi_from = max(istart + len(STARTMARK), iend - SEARCH_WINDOW)
	        iend = pdf.find(ENDMARK, eoi_from)
	        if iend < 0:
	            raise Exception("Did not find end of JPG!")
	        istart += STARTFIX
	        iend += ENDFIX
	        yield istart, iend
	        pos = iend


	def main(argv):
	    """Write every JPEG found in the PDF named by ``argv[1]`` to jpgN.jpg.

	    Files are written to the current directory, numbered from 0 in the
	    order the images appear in the PDF.
	    """
	    if len(argv) < 2:
	        sys.exit("usage: pdf2jpg.py FILE.pdf")
	    # Context managers guarantee both files are closed, even on error.
	    with open(argv[1], "rb") as pdffile:
	        pdf = pdffile.read()
	    for njpg, (istart, iend) in enumerate(iter_jpg_spans(pdf)):
	        print("JPG %d from %d to %d" % (njpg, istart, iend))
	        with open("jpg%d.jpg" % njpg, "wb") as jpgfile:
	            jpgfile.write(pdf[istart:iend])


	if __name__ == "__main__":
	    main(sys.argv)