Skip to content

Instantly share code, notes, and snippets.

@roh26it
Created May 17, 2023 17:25
Show Gist options
  • Save roh26it/d1d7af2432175b443355990ec640b1d5 to your computer and use it in GitHub Desktop.
Save roh26it/d1d7af2432175b443355990ec640b1d5 to your computer and use it in GitHub Desktop.
import requests
import fitz # PyMuPDF
from io import BytesIO
import re
def download_pdf(url, timeout=30):
    """Download a PDF and return its content as an in-memory binary stream.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled server would block this call
            indefinitely.

    Returns:
        A ``BytesIO`` wrapping the raw response body.

    Raises:
        Exception: If the server responds with a status code other than 200.
        requests.exceptions.RequestException: Propagated from ``requests``
            when the connection fails or the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
    return BytesIO(response.content)
# def extract_text_from_pdf(pdf_stream):
# with fitz.open(stream=pdf_stream) as pdf:
# text = ""
# for page in pdf:
# page_text = page.get_text("text")
# # Keep only lines of at least 20 characters that do not start with a bracketed reference marker such as "[12]"
# filtered_lines = [line for line in page_text.splitlines() if len(line) >= 20 and not re.match(r'^\[\d+\]', line)]
# text += "\n".join(filtered_lines)
# return text
def is_table(block, min_rows=3, min_columns=2):
    """Heuristically decide whether a sequence of text lines looks like a table.

    Args:
        block: Sequence of line dicts, each carrying a ``'spans'`` list.
        min_rows: Minimum number of lines required to count as a table.
        min_columns: Minimum number of spans the first line must contain.

    Returns:
        True when there are at least ``min_rows`` lines, the first line has
        at least ``min_columns`` spans, and every line has exactly as many
        spans as the first one; False otherwise.
    """
    if len(block) < min_rows:
        return False

    # The first line fixes the column count every other line must match.
    expected_columns = len(block[0]['spans'])
    if expected_columns < min_columns:
        return False

    return all(len(row['spans']) == expected_columns for row in block)
def extract_text_from_pdf(pdf_stream, heading_threshold=11.5):
    """Extract the text of a PDF, skipping table-like blocks and marking headings.

    Every text block that ``is_table`` does not classify as a table is walked
    line by line. A line whose first span has a font size of at least
    ``heading_threshold`` is emitted on its own line, preceded by a blank
    line; any other line is appended followed by a single space.

    Args:
        pdf_stream: In-memory PDF data handed to ``fitz.open`` as ``stream``.
        heading_threshold: Font size at or above which a line is treated as
            a heading. Tune this for the documents being processed.

    Returns:
        The extracted text as one string.
    """
    parts = []
    # filetype is given explicitly so the stream is always parsed as a PDF.
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf:
        for page in pdf:
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # only text blocks carry "lines"
                    continue
                lines = block["lines"]
                if is_table(lines):
                    continue
                for line in lines:
                    spans = line["spans"]
                    if not spans:
                        continue
                    # A line is split into one span per font run; join them
                    # all so text after the first run is not lost.
                    line_text = "".join(span["text"] for span in spans)
                    # Heading detection uses the font size of the first span.
                    if spans[0]["size"] >= heading_threshold:
                        parts.append("\n\n" + line_text + "\n")
                    else:
                        parts.append(line_text + " ")
    return "".join(parts)
# NOTE(review): `pdf_url` is not defined in the visible part of this file;
# presumably it is set before this point -- confirm.
pdf_stream = download_pdf(pdf_url)
# Keep only the text that precedes the first occurrence of "References\n".
pdf_text = extract_text_from_pdf(pdf_stream).partition("References\n")[0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment