Skip to content

Instantly share code, notes, and snippets.

@romaresccoa
Last active January 21, 2020 00:31
Show Gist options
  • Save romaresccoa/218c452d8ca6198d9fc17319a381a63a to your computer and use it in GitHub Desktop.
Save romaresccoa/218c452d8ca6198d9fc17319a381a63a to your computer and use it in GitHub Desktop.
from PyPDF2 import PdfFileReader
class Extractor:
    """Interactively pick outline sections of a PDF and extract their text.

    On construction the PDF outline is walked, the user is asked on stdin
    whether each outline entry should be kept, and the text of every page
    from the first kept section to the end of the document is extracted.

    Attributes:
        pdf_reader: The ``PdfFileReader`` opened on the given file.
        sections: Outline entries the user accepted, in outline order.
        pages_range: Zero-based page indices whose text is extracted.
        raw_text: Lower-cased text of ``pages_range`` with newlines removed.
    """

    def __init__(self, file_name: str) -> None:
        """Open ``file_name`` and build sections, page range and raw text.

        Args:
            file_name: Passed straight to ``PdfFileReader``.
        """
        self.pdf_reader = PdfFileReader(file_name)
        self.sections = self._extract_sections(self.pdf_reader.outlines)
        self.pages_range = self._find_pages_range()
        # Kept on the instance: useful for creating / testing new functionality.
        self.raw_text = self._extract_raw_text()

    def _has_subsections(self, section) -> bool:
        """Return True when ``section`` is a nested list of outline entries."""
        return isinstance(section, list)

    def _propose_section(self, section) -> bool:
        """Ask the user on stdin whether ``section`` should be kept.

        Prompts repeatedly until the answer is exactly ``'Y'`` or ``'n'``.

        Returns:
            True if the user answered ``'Y'``, False if ``'n'``.
        """
        print('Extracted section title: {}.'.format(section.title))
        option = ''
        while option not in ('Y', 'n'):
            option = input('Do You want to include it in the summary? [Y - yes/n - no]:\n')
        return option == 'Y'

    def _get_sections(self, section, sections) -> None:
        """Walk ``section`` depth-first, appending accepted entries to ``sections``.

        Args:
            section: Iterable of outline entries; nested lists are recursed into.
            sections: Output list, mutated in place.
        """
        for subsection in section:
            if self._has_subsections(subsection):
                self._get_sections(subsection, sections)
            elif self._propose_section(subsection):
                sections.append(subsection)

    def _extract_sections(self, outlines) -> list:
        """Return the outline entries of ``outlines`` accepted by the user."""
        sections = []
        # The top level is walked exactly like any nested level.
        self._get_sections(outlines, sections)
        return sections

    def _find_pages_range(self) -> list:
        """Return page indices from the first kept section to the last page.

        Returns:
            A list of zero-based page indices; empty when no section was kept.
        """
        # TODO: It would be great if someone could
        # add code to find where references
        # start so they won't be included
        # in text of last section.
        if not self.sections:
            # Nothing selected: there is no first section to start from.
            return []
        start = self.pdf_reader.getDestinationPageNumber(self.sections[0])
        # getNumPages() is the page count; range() excludes it, so the last
        # index produced is the last page of the document.
        end = self.pdf_reader.getNumPages()
        return list(range(start, end))

    def _extract_raw_text(self) -> str:
        """Return the concatenated text of ``pages_range``.

        Newlines are removed and the result is lower-cased so that section
        titles can be searched for case-insensitively.
        """
        # TODO: It would be great if someone could
        # do more preprocessing such as...
        # deleting tables etc..
        page_texts = (
            self.pdf_reader.getPage(page).extractText()
            for page in self.pages_range
        )
        return ''.join(page_texts).replace('\n', '').lower()

    def extract_texts(self) -> dict:
        """
        Use this method to obtain dictionary with
        section names as keys and extracted texts
        as values.

        Each title is searched for (lower-cased) in ``raw_text``, starting
        after the previously located title. A section's text runs from just
        after its own title up to the start of the next located title, or to
        the end of the text for the last one. A section whose title cannot
        be located maps to an empty string.

        Returns:
            Dict keyed by section title in section order; empty when no
            section was kept.
        """
        text = self.raw_text
        # Pre-fill so keys keep section order and missing titles map to ''.
        result_dict = {section.title: '' for section in self.sections}

        # (title, index where the title starts, index where its body starts)
        located = []
        cursor = 0
        for section in self.sections:
            needle = section.title.lower()
            title_start = text.find(needle, cursor)
            if title_start == -1:
                # Title not present in the extracted text; leave it empty
                # instead of slicing with a bogus -1 offset.
                continue
            body_start = title_start + len(needle)
            located.append((section.title, title_start, body_start))
            cursor = body_start

        for position, (title, _, body_start) in enumerate(located):
            if position + 1 < len(located):
                body_end = located[position + 1][1]
            else:
                body_end = len(text)
            result_dict[title] = text[body_start:body_end]
        return result_dict

    def get_section_names(self) -> list:
        """Return the titles of the kept sections, in order."""
        return [section.title for section in self.sections]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment