Skip to content

Instantly share code, notes, and snippets.

@tlkahn
Forked from tilusnet/parse_toc.py
Created March 2, 2020 10:13
Show Gist options
  • Save tlkahn/d587376ab8987067457a478f421ca64d to your computer and use it in GitHub Desktop.
Save tlkahn/d587376ab8987067457a478f421ca64d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# parse_toc.py
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
def parse(filename, maxlevel):
    """Print the outline (table of contents) of a PDF as a starred list.

    Every outline entry whose level is at most ``maxlevel`` is printed on
    its own line as ``level`` asterisks followed by the entry title.
    Before printing, newlines are removed from the title and its first
    whitespace-separated word is dropped.

    Args:
        filename: Path of the PDF file to read.
        maxlevel: Deepest outline level to print; deeper entries are skipped.
    """
    # The context manager guarantees the file is closed even if parsing
    # raises. The loop stays inside the block because the outlines may be
    # read from the file while iterating (assumption -- confirm against
    # pdfminer's get_outlines implementation).
    with open(filename, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, dest, a, se) in doc.get_outlines():
            if level > maxlevel:
                continue
            # Strip embedded newlines directly on the str; no bytes round
            # trip is needed on Python 3.
            # NOTE(review): the first word is discarded -- presumably a
            # section number prefix; confirm against the target PDFs.
            title_words = title.replace('\n', '').split()[1:]
            print('*' * level, ' '.join(title_words))
if __name__ == '__main__':
    import sys

    # Command line: <script> <pdf path> <max outline level>
    argv = sys.argv
    if len(argv) == 3:
        pdf_path, depth = argv[1], argv[2]
        parse(pdf_path, int(depth))
    else:
        # Wrong argument count: show usage and exit with status 2.
        print('Usage: %s xxx.pdf level' % argv[0])
        sys.exit(2)
@tlkahn
Copy link
Author

tlkahn commented Mar 2, 2020

Upgraded to work with Python 3.7.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment