roh26it · May 17, 2023 17:25
diff --git a/arxiv-pdf-extract.py b/arxiv-pdf-extract.py
 import requests
 import fitz  # PyMuPDF
 from io import BytesIO
 import re

 def download_pdf(url, timeout=30):
    """Download the PDF at *url* and return its content as an in-memory stream.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Pass ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        A ``BytesIO`` wrapping the response body.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    # requests never times out on its own; without this a stalled server
    # would block the caller forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
    return BytesIO(response.content)

 # def extract_text_from_pdf(pdf_stream):
 #     with fitz.open(stream=pdf_stream) as pdf:
 #         text = ""
 #         for page in pdf:
 #             page_text = page.get_text("text")
 #             # Drop lines shorter than 20 characters and bracketed reference entries such as "[12] ..."
 #             filtered_lines = [line for line in page_text.splitlines() if len(line) >= 20 and not re.match(r'^\[\d+\]', line)]
 #             text += "\n".join(filtered_lines)
 #     return text

 def is_table(block, min_rows=3, min_columns=2):
    """Heuristically decide whether a list of text lines looks like a table.

    Args:
        block: Sequence of line dicts, each carrying a ``'spans'`` list.
        min_rows: Minimum number of lines required.
        min_columns: Minimum number of spans required in the first line.

    Returns:
        True when there are at least ``min_rows`` lines, the first line has
        at least ``min_columns`` spans, and every line has the same number
        of spans as the first one; False otherwise.
    """
    # An empty block can never be a table; checking it explicitly also keeps
    # block[0] below from raising IndexError when min_rows is 0 or negative.
    if not block or len(block) < min_rows:
        return False

    column_count = len(block[0]['spans'])
    if column_count < min_columns:
        return False

    # Every row must have the same number of cells as the first row.
    return all(len(line['spans']) == column_count for line in block)

 def extract_text_from_pdf(pdf_stream, heading_threshold=11.5):
    """Extract the text of a PDF, putting large-font lines on their own line.

    Only text blocks (``type == 0``) are read, and blocks that ``is_table``
    flags as tables are skipped. A line whose first span has a font size of
    at least ``heading_threshold`` is emitted as a heading, preceded by a
    blank line and followed by a newline. Every other line is appended
    followed by a single space.

    Args:
        pdf_stream: In-memory PDF data handed to ``fitz.open(stream=...)``.
        heading_threshold: Font size at or above which a line is treated as
            a heading. Tune it for the documents being processed.

    Returns:
        The extracted text as a single string.
    """
    parts = []
    with fitz.open(stream=pdf_stream) as pdf:
        for page in pdf:
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # Not a text block
                    continue
                lines = block["lines"]
                if is_table(lines):
                    continue
                for line in lines:
                    spans = line["spans"]
                    if not spans:
                        # Nothing to emit; also avoids IndexError on spans[0].
                        continue
                    # A line is split into one span per font run, so the
                    # text of all spans is needed to get the whole line.
                    line_text = "".join(span["text"] for span in spans)
                    # Detect headings based on the font size of the first span
                    if spans[0]["size"] >= heading_threshold:
                        parts.append("\n\n" + line_text + "\n")
                    else:
                        parts.append(line_text + " ")
    return "".join(parts)

 # Fetch the PDF, extract its text and keep only what comes before the
 # first occurrence of "References\n".
 pdf_stream = download_pdf(pdf_url)
 pdf_text = extract_text_from_pdf(pdf_stream).partition("References\n")[0]
	import requests
	import fitz # PyMuPDF
	from io import BytesIO
	import re

	def download_pdf(url, timeout=30):
	    """Download the PDF at *url* and return its content as an in-memory stream.

	    Args:
	        url: Address of the PDF to fetch.
	        timeout: Seconds ``requests`` waits for the server before giving up.
	            Pass ``None`` to wait indefinitely (the previous behaviour).

	    Returns:
	        A ``BytesIO`` wrapping the response body.

	    Raises:
	        Exception: If the server answers with a status code other than 200.
	    """
	    # requests never times out on its own; without this a stalled server
	    # would block the caller forever.
	    response = requests.get(url, timeout=timeout)
	    if response.status_code != 200:
	        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
	    return BytesIO(response.content)

	# def extract_text_from_pdf(pdf_stream):
	# with fitz.open(stream=pdf_stream) as pdf:
	# text = ""
	# for page in pdf:
	# page_text = page.get_text("text")
	# # Drop lines shorter than 20 characters and bracketed reference entries such as "[12] ..."
	# filtered_lines = [line for line in page_text.splitlines() if len(line) >= 20 and not re.match(r'^\[\d+\]', line)]
	# text += "\n".join(filtered_lines)
	# return text

	def is_table(block, min_rows=3, min_columns=2):
	    """Heuristically decide whether a list of text lines looks like a table.

	    Args:
	        block: Sequence of line dicts, each carrying a ``'spans'`` list.
	        min_rows: Minimum number of lines required.
	        min_columns: Minimum number of spans required in the first line.

	    Returns:
	        True when there are at least ``min_rows`` lines, the first line has
	        at least ``min_columns`` spans, and every line has the same number
	        of spans as the first one; False otherwise.
	    """
	    # An empty block can never be a table; checking it explicitly also keeps
	    # block[0] below from raising IndexError when min_rows is 0 or negative.
	    if not block or len(block) < min_rows:
	        return False

	    column_count = len(block[0]['spans'])
	    if column_count < min_columns:
	        return False

	    # Every row must have the same number of cells as the first row.
	    return all(len(line['spans']) == column_count for line in block)

	def extract_text_from_pdf(pdf_stream, heading_threshold=11.5):
	    """Extract the text of a PDF, putting large-font lines on their own line.

	    Only text blocks (``type == 0``) are read, and blocks that ``is_table``
	    flags as tables are skipped. A line whose first span has a font size of
	    at least ``heading_threshold`` is emitted as a heading, preceded by a
	    blank line and followed by a newline. Every other line is appended
	    followed by a single space.

	    Args:
	        pdf_stream: In-memory PDF data handed to ``fitz.open(stream=...)``.
	        heading_threshold: Font size at or above which a line is treated as
	            a heading. Tune it for the documents being processed.

	    Returns:
	        The extracted text as a single string.
	    """
	    parts = []
	    with fitz.open(stream=pdf_stream) as pdf:
	        for page in pdf:
	            for block in page.get_text("dict")["blocks"]:
	                if block["type"] != 0:  # Not a text block
	                    continue
	                lines = block["lines"]
	                if is_table(lines):
	                    continue
	                for line in lines:
	                    spans = line["spans"]
	                    if not spans:
	                        # Nothing to emit; also avoids IndexError on spans[0].
	                        continue
	                    # A line is split into one span per font run, so the
	                    # text of all spans is needed to get the whole line.
	                    line_text = "".join(span["text"] for span in spans)
	                    # Detect headings based on the font size of the first span
	                    if spans[0]["size"] >= heading_threshold:
	                        parts.append("\n\n" + line_text + "\n")
	                    else:
	                        parts.append(line_text + " ")
	    return "".join(parts)

	# Fetch the PDF, extract its text and keep only what comes before the
	# first occurrence of "References\n".
	pdf_stream = download_pdf(pdf_url)
	pdf_text = extract_text_from_pdf(pdf_stream).partition("References\n")[0]