Created
April 25, 2024 21:59
-
-
Save alpaylan/e3c8b052e43e9f6640a8ecaa6fc769d7 to your computer and use it in GitHub Desktop.
Heuristically splits a book pdf into chapters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script takes a PDF book, splits it into chapters, and saves each chapter as a separate PDF file.
import os
import re

import PyPDF2
# Input PDF and the directory that receives the per-chapter PDFs.
INPUT_PATH = 'book.pdf'
OUTPUT_DIR = 'book'

# A page mentioning "chapter" is treated as a chapter opener only when its
# word count is below this fraction of the average page word count.
SHORT_PAGE_RATIO = 0.6


def collect_page_stats(pdf_reader):
    """Return one ``(page_index, word_count, mentions_chapter)`` tuple per page.

    ``pdf_reader`` is any object exposing a ``pages`` sequence whose items
    have an ``extract_text()`` method. The word count is the number of
    space-separated pieces of the extracted text; ``mentions_chapter`` is
    True when the lower-cased text contains the substring "chapter".
    """
    page_lengths = []
    for page_num, page in enumerate(pdf_reader.pages):
        # Treat a page with no extractable text as empty rather than crashing.
        text = page.extract_text() or ""
        print(f'Processing page {page_num}...')
        page_lengths.append((page_num, len(text.split(" ")), "chapter" in text.lower()))
    return page_lengths


def find_chapter_starts(page_lengths, short_ratio=SHORT_PAGE_RATIO):
    """Return the page indices that look like chapter openers.

    A page qualifies when it mentions "chapter" and its word count is
    strictly below ``short_ratio`` times the average word count of all
    pages. Returns an empty list when ``page_lengths`` is empty, which also
    avoids dividing by zero when computing the average.
    """
    if not page_lengths:
        return []

    # Average page length (in words)
    average_length = sum(length for _, length, _ in page_lengths) / len(page_lengths)

    # Filter pages that mention "chapter"
    chapter_pages = [page for page in page_lengths if page[2]]
    print(chapter_pages)
    print("Average length: ", average_length)

    # Keep chapter pages that are shorter than 60% of the average length
    short_chapter_pages = [page for page in chapter_pages if page[1] < (average_length * short_ratio)]
    print(short_chapter_pages)
    return [page[0] for page in short_chapter_pages]


def build_chapter_ranges(chapter_starts, total_pages):
    """Partition ``range(total_pages)`` into half-open ``(start, end)`` ranges.

    ``chapter_starts`` must be ascending page indices; each one begins a new
    range. Pages before the first start form the leading range. Empty ranges
    (for example when the very first page is a chapter opener) are dropped so
    no empty PDF is written for them.
    """
    boundaries = [0, *chapter_starts, total_pages]
    return [(start, end) for start, end in zip(boundaries, boundaries[1:]) if start < end]


def main():
    """Split ``book.pdf`` into ``book/chapter_N.pdf`` files, one per chapter."""
    # The input file must stay open while pages are copied, so all the work
    # happens inside this ``with`` block; the file is closed even on error.
    with open(INPUT_PATH, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        page_lengths = collect_page_stats(pdf_reader)
        print(page_lengths)

        # Partition pdf by chapter pages
        chapter_starts = find_chapter_starts(page_lengths)
        chapter_ranges = build_chapter_ranges(chapter_starts, len(page_lengths))
        print(chapter_ranges)

        # Save each chapter as a separate PDF file
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        for i, (start, end) in enumerate(chapter_ranges):
            pdf_writer = PyPDF2.PdfWriter()
            for page_num in range(start, end):
                pdf_writer.add_page(pdf_reader.pages[page_num])
            with open(f'{OUTPUT_DIR}/chapter_{i+1}.pdf', 'wb') as pdf_output_file:
                pdf_writer.write(pdf_output_file)

    print('Chapters separated successfully.')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment