Created
December 7, 2023 18:34
-
-
Save buckmaxwell/488f46f4a85400e2dc6d44d3f1cc7a5b to your computer and use it in GitHub Desktop.
AI Research Paper Downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import requests | |
import glob | |
import time | |
import os | |
import hashlib | |
from urllib.request import urlretrieve | |
from PyPDF2 import PdfReader | |
from PyPDF2.errors import PdfReadError | |
import logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
# Semantic Scholar paper-search endpoint and local download target.
base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
download_folder = "/Users/maxbuck/Desktop/autopapers"

# Candidate search topics kept for convenience; only the value assigned to
# params["query"] below is actually sent.  (The original file listed these
# as bare string literals, which are dead no-op statements.)
CANDIDATE_QUERIES = (
    "Artificial Intelligence",
    "Machine Learning",
    "Deep Learning",
    "Natural Language Processing",
    "Conversational AI",
    "Transformer Models",
)

# Query parameters for the Semantic Scholar Graph API /paper/search call.
params = {
    "query": "Artificial Intelligence",
    "year": "2022-",  # papers published 2022 or later
    "openAccessPdf": True,  # only papers with a freely downloadable PDF
    "fieldsOfStudy": "Computer Science",
    "fields": "title,year,authors,openAccessPdf,referenceCount,citationCount,influentialCitationCount,abstract,tldr,venue",
    "venue": "J. Mach. Learn. Res.,IEEE Trans. Pattern Anal. Mach. Intell.,Artif. Intell.,J. Artif. Intell. Res.,NeurIPS,ICML,ICLR,AAAI,ACL,EMNLP,NAACL,ICRA,IR",
    "offset": 0,
    "limit": 100,  # API page size
}

# Browser-like headers used when fetching the PDFs themselves; some hosts
# refuse requests that have no User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept-Encoding": "none",
    "Accept-Language": "en-US,en;q=0.8",
    "Connection": "keep-alive",
}
def check_file_exists(download_folder, pid):
    """Return True if a PDF whose name ends in ``_<pid>.pdf`` is present.

    Downloaded files are named ``{year}_{cites}_{author}_{title}_{pid}.pdf``,
    so the pid is the segment after the last underscore, before the first dot.
    """
    pdf_paths = glob.glob(f"{download_folder}/*.pdf")
    return any(
        os.path.basename(path).split("_")[-1].split(".")[0] == pid
        for path in pdf_paths
    )
def check_pdf(file_path):
    """Return True if *file_path* parses as a PDF with at least one page.

    Any read/parse error is logged and reported as False rather than
    propagated, because callers use the result to decide whether to keep
    or delete a freshly downloaded file.
    """
    try:
        # BUG FIX: the original passed open(...) directly to PdfReader and
        # never closed the handle; the context manager guarantees closure.
        with open(file_path, "rb") as fh:
            reader = PdfReader(fh)
            if len(reader.pages) > 0:
                return True
    except Exception as e:
        logger.error(f"FAILURE: Error reading {file_path}: {e}")
    return False
# Interactive download loop: page through Semantic Scholar search results,
# prompt the user for each sufficiently-cited paper, and save accepted PDFs
# into download_folder, verifying each one parses as a PDF.
while True:
    response = requests.get(base_url, params=params)
    response_data = response.json()
    papers = response_data["data"]
    for paper in papers:
        influential_citation_count = paper["influentialCitationCount"]
        citation_count = paper["citationCount"]
        # Skip papers with no influential citations.
        if influential_citation_count < 1:
            continue

        # Build a short stable id (pid) from year + sorted author ids so the
        # same paper hashes to the same pid across runs.
        authors_id_list = [
            str(author.get("authorId", "")) for author in paper["authors"]
        ]
        authors_id_list.sort()
        hash_object = hashlib.sha1(
            (str(paper["year"]) + "".join(authors_id_list)).encode("utf-8")
        )
        pid = hash_object.hexdigest()[:6]

        # Fields used to build a descriptive, filesystem-safe filename.
        publication_year = paper["year"]
        citation_count = paper["citationCount"]
        first_author_last_name = (
            paper["authors"][0]["name"].split()[-1]
            if paper["authors"]
            else "unknown_author"
        )
        truncated_paper_name = (
            paper["title"][:30].replace(" ", "_").replace("/", "_").replace("\\", "_")
        )
        filename = f"{publication_year}_{citation_count}_{first_author_last_name}_{truncated_paper_name}_{pid}.pdf"
        pdf_url = paper["openAccessPdf"]["url"]
        file_path = os.path.join(download_folder, filename)

        # Skip anything already downloaded (exact filename or matching pid).
        if os.path.isfile(file_path) or check_file_exists(download_folder, pid):
            # BUG FIX: the original string lacked the f-prefix and logged
            # the literal text "{pid}" instead of the value.
            logger.debug(f"SKIPPING: File with pid {pid} already exists.")
            continue

        print("--------------------------------------------------------------")
        print(f"# {paper['title']} ({paper['year']})")
        print(f"Venue: {paper['venue']}")
        print(", ".join([author["name"] for author in paper["authors"]]))
        if paper.get("tldr"):
            print("TL;DR: ", paper["tldr"].get("text", "No TL;DR available."))

        abstract = input("Download (d) / Skip (s) / Show abstract (A): ")
        if abstract == "s":
            continue
        elif abstract == "d":
            pass
        else:
            # Any other answer shows the abstract, then asks again.
            print("")
            # BUG FIX: the API returns the "abstract" key even when its
            # value is None (it was requested in params["fields"]), so the
            # original key-membership test printed "None"; test the value.
            if paper.get("abstract"):
                print(paper["abstract"])
            else:
                print("No abstract available.")
            down_or_skip = input("Download (D) / Skip (s): ")
            if down_or_skip == "s":
                continue

        try:
            start_time = time.time()
            max_time = 25  # seconds; abort downloads that drag on too long
            pdf_response = requests.get(pdf_url, headers=headers, stream=True)
            if pdf_response.status_code == 200:
                with open(file_path, "wb") as file:
                    for chunk in pdf_response.iter_content(chunk_size=1024):
                        if time.time() - start_time > max_time:
                            raise Exception(
                                f"Max time exceeded for {pdf_url}. Skipping."
                            )
                        if chunk:
                            file.write(chunk)
            if check_pdf(file_path):
                logger.info(f"SUCCESS: copied {pdf_url} to {file_path}")
            else:
                logger.info(f"FAILURE: could not copy {pdf_url} to {file_path}")
                # Remove the partial/corrupt file; tolerate "already gone".
                try:
                    os.remove(file_path)
                except OSError:
                    # BUG FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.
                    pass
        except Exception as e:
            logger.exception(
                f"FAILURE: Error downloading {pdf_url} to {file_path}: {e}"
            )
        finally:
            time.sleep(0.25)  # brief pause to be polite to the PDF host

    # Advance to the next result page, or stop when the API gives no cursor.
    if "next" in response_data:
        params["offset"] = response_data["next"]
    else:
        break  # we're done
    time.sleep(1)  # rate-limit calls to the search API
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment