Created
May 17, 2023 17:25
-
-
Save roh26it/d1d7af2432175b443355990ec640b1d5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import fitz # PyMuPDF | |
from io import BytesIO | |
import re | |
def download_pdf(url, timeout=30):
    """Download a PDF and return its content as an in-memory byte stream.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without a timeout a
            stalled server would block this call indefinitely.

    Returns:
        A ``BytesIO`` wrapping the response body.

    Raises:
        Exception: If the server answers with a status code other than 200.
            Connection errors and timeouts raised by ``requests`` propagate
            unchanged.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
    return BytesIO(response.content)
# def extract_text_from_pdf(pdf_stream): | |
# with fitz.open(stream=pdf_stream) as pdf: | |
# text = "" | |
# for page in pdf: | |
# page_text = page.get_text("text") | |
# # Remove lines with fewer than 20 characters | |
# filtered_lines = [line for line in page_text.splitlines() if len(line) >= 20 and not re.match(r'^\[\d+\]', line)] | |
# text += "\n".join(filtered_lines) | |
# return text | |
def is_table(block, min_rows=3, min_columns=2):
    """Guess whether a block of text lines is laid out like a table.

    Args:
        block: Sequence of line dicts, each carrying a ``"spans"`` list.
        min_rows: Minimum number of lines the block must contain.
        min_columns: Minimum number of spans the first line must contain.

    Returns:
        True when the block has at least ``min_rows`` lines, its first line
        has at least ``min_columns`` spans, and every line has exactly as
        many spans as the first one. False otherwise, including for an
        empty block.
    """
    # An empty block can never be a table; checking it explicitly also keeps
    # block[0] below safe when the caller passes min_rows <= 0.
    if not block or len(block) < min_rows:
        return False
    column_count = len(block[0]["spans"])
    if column_count < min_columns:
        return False
    return all(len(line["spans"]) == column_count for line in block)
def extract_text_from_pdf(pdf_stream, heading_threshold=11.5):
    """Extract running text from a PDF, setting headings apart on their own lines.

    Text blocks that ``is_table`` classifies as tables are skipped, as are
    non-text blocks.

    Args:
        pdf_stream: In-memory PDF data handed to ``fitz.open`` as ``stream``.
        heading_threshold: Font size (as reported by PyMuPDF) at or above
            which a line is treated as a heading. Tune this per document.

    Returns:
        A single string. Each heading line is emitted as
        ``"\\n\\n<text>\\n"``; every other line is emitted as its text
        followed by a single space.
    """
    parts = []
    # filetype tells PyMuPDF how to interpret the raw stream, since there is
    # no file name to infer it from.
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf:
        for page in pdf:
            for block in page.get_text("dict")["blocks"]:
                # Type 0 marks a text block; anything else has no lines to read.
                if block["type"] != 0 or is_table(block["lines"]):
                    continue
                for line in block["lines"]:
                    spans = line["spans"]
                    if not spans:
                        continue
                    # A line is split into several spans wherever its font
                    # attributes change, so join them all; reading only the
                    # first span would drop the rest of the line.
                    line_text = "".join(span["text"] for span in spans)
                    # The first span's size decides whether this is a heading.
                    if spans[0]["size"] >= heading_threshold:
                        parts.append("\n\n" + line_text + "\n")
                    else:
                        parts.append(line_text + " ")
    return "".join(parts)
# Fetch the PDF at ``pdf_url`` (expected to be defined before this point) and
# keep only the text that precedes the first "References\n" marker.
pdf_stream = download_pdf(pdf_url)
pdf_text = extract_text_from_pdf(pdf_stream).partition("References\n")[0]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment