Created
July 12, 2024 18:18
-
-
Save htlin222/7716697f591fd7d78918623f02bb4150 to your computer and use it in GitHub Desktop.
This script splits a PDF file into individual pages and saves each page as a Markdown file with an auto-generated title and explanation in Traditional Chinese. It uses the OpenAI API to generate titles and explanations.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# title: pdf_explain
# author: Hsieh-Ting Lin, the Lizard 🦎
# description: This script splits a PDF file into individual pages and saves each page as a Markdown file with an auto-generated title and explanation in Traditional Chinese. It uses the OpenAI API to generate titles and explanations.
# date: "2024-07-13"
import os

import openai
import PyPDF2

# Initialize the OpenAI client once at import time; the helper functions
# in this script send their chat-completion requests through it.
client = openai.OpenAI()
def generate_title(page_text, model="gpt-3.5-turbo"):
    """Ask the OpenAI chat API for a concise title for one page of text.

    Args:
        page_text: Text extracted from a single PDF page.
        model: Name of the chat model to query. Defaults to the model the
            script has always used, so existing callers are unaffected.

    Returns:
        The generated title with surrounding whitespace and double quotes
        removed, or an empty string when the response carries no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"Generate a concise title for the following page content:\n\n{page_text}",
            }
        ],
    )
    # The message content may be None; fall back to "" so the string
    # clean-up below cannot raise AttributeError.
    content = response.choices[0].message.content or ""
    # Strip whitespace first: a trailing newline would otherwise shield the
    # closing quote from strip('"').
    return content.strip().strip('"').strip()
def generate_explain(page_text, model="gpt-3.5-turbo"):
    """Ask the OpenAI chat API to explain one page in Traditional Chinese.

    Args:
        page_text: Text extracted from a single PDF page.
        model: Name of the chat model to query. Defaults to the model the
            script has always used, so existing callers are unaffected.

    Returns:
        The explanation with surrounding whitespace removed, or an empty
        string when the response carries no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                # The prompt is Chinese for "Explain this in Traditional Chinese".
                "content": f"以繁體中文解釋一下:\n\n{page_text}",
            }
        ],
    )
    # The message content may be None; fall back to "" so .strip() cannot
    # raise AttributeError.
    content = response.choices[0].message.content or ""
    return content.strip()
def add_title_and_explain_to_page(page_text, title, explain):
    """Format one page as Markdown with a heading and a hidden explanation.

    Args:
        page_text: Body text of the page.
        title: Title rendered as a level-2 Markdown heading.
        explain: Explanation embedded in a trailing HTML comment.

    Returns:
        A string of the form ``## title``, blank line, page text, blank
        line, ``<!-- explain -->``.
    """
    # A literal "-->" inside the explanation would close the HTML comment
    # early and leak the remainder into the rendered page, so neutralize it.
    safe_explain = explain.replace("-->", "--&gt;")
    return f"## {title}\n\n{page_text}\n\n<!-- {safe_explain} -->"
def split_pdf(file_path):
    """Split a PDF into per-page Markdown files plus a combined ``all.md``.

    For every page, the extracted text is sent to ``generate_title`` and
    ``generate_explain``; the result is written to ``page_NNN.md`` inside a
    directory named after the PDF (without extension), created under the
    current working directory. All pages are also concatenated, each
    followed by a ``---`` separator, into ``all.md`` in that directory.

    Args:
        file_path: Path to the PDF file to process.

    Returns:
        None. Errors are reported with ``print`` rather than raised: a
        missing file, an unreadable PDF, and any other exception each
        produce a message on stdout.
    """
    try:
        # The context manager closes the PDF even when a page fails midway;
        # an explicit close() at the end would be skipped on any exception.
        with open(file_path, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            base_name = os.path.splitext(os.path.basename(file_path))[0]
            output_dir = os.path.join(os.getcwd(), base_name)
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(output_dir, exist_ok=True)

            sections = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                page_text = page.extract_text()
                title = generate_title(page_text)
                explain = generate_explain(page_text)
                new_page_text = add_title_and_explain_to_page(page_text, title, explain)

                output_filename = os.path.join(output_dir, f"page_{page_num:03}.md")
                with open(output_filename, "w", encoding="utf-8") as output_file:
                    output_file.write(new_page_text)

                # Every page, including the last, is followed by a separator.
                sections.append(new_page_text + "\n\n---\n\n")
                print(f"Created: {output_filename}")

        # Save the combined content to all.md
        all_filename = os.path.join(output_dir, "all.md")
        with open(all_filename, "w", encoding="utf-8") as all_file:
            all_file.write("".join(sections))
        print(f"Created: {all_filename}")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except PyPDF2.errors.PdfReadError:
        print(f"Error reading the PDF file: {file_path}")
    except Exception as e:
        # Top-level boundary for this command-line script: report and return
        # instead of showing a traceback.
        print(f"An unexpected error occurred: {e}")
if __name__ == "__main__":
    # Command-line entry point: takes the PDF path via -f/--file and
    # passes it to split_pdf.
    import argparse

    cli = argparse.ArgumentParser(
        description="Split PDF into individual pages and save as Markdown files."
    )
    cli.add_argument("-f", "--file", required=True, help="Path to the PDF file")
    split_pdf(cli.parse_args().file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here is a brief description of its key components:
- Initialization: Import the necessary libraries, including `os`, `openai`, and `PyPDF2`. Initialize the OpenAI client.
- Title Generation: The `generate_title` function takes the text of a page and uses OpenAI to create a concise title.
- Explanation Generation: The `generate_explain` function generates an explanation in Traditional Chinese for the given page text using OpenAI.
- Add Title and Explanation: The `add_title_and_explain_to_page` function formats the page text with the generated title and explanation.
- PDF Splitting: The `split_pdf` function reads the PDF file, processes each page to generate titles and explanations, and saves the result as Markdown files in a directory named after the PDF file.
- Main Execution: The script accepts a PDF file path as a command-line argument and calls `split_pdf` to process the file.

This script facilitates the extraction and annotation of PDF content, making it more accessible and comprehensible.