Skip to content

Instantly share code, notes, and snippets.

@jasonrig
Created March 21, 2023 12:21
Show Gist options
  • Save jasonrig/5b4ed42c24803818a5dad52a07917cf2 to your computer and use it in GitHub Desktop.
Verify and extract abstracts from PubMed
import csv
import glob
import gzip
import hashlib
import xml.sax
import tqdm
# Gzipped PubMed XML archives to process. Sorted so that runs are
# reproducible: glob.glob() returns matches in arbitrary, filesystem-dependent
# order, which would otherwise make the output row order vary between runs.
file_list = sorted(glob.glob("./pubmed/*.xml.gz"))
def verify(path, checksum_chunk_size=1024):
    """Check a file against the MD5 digest stored in its ``<path>.md5`` sidecar.

    The expected digest is the last whitespace-separated token of the sidecar
    file, so both ``MD5(name)= <hex>`` and bare ``<hex>`` layouts work.

    Args:
        path: Path of the file to check; ``f"{path}.md5"`` must exist as well.
        checksum_chunk_size: Number of bytes read per hashing step, so the
            file never has to fit in memory at once.

    Raises:
        AssertionError: If the computed digest differs from the expected one.
        IndexError: If the sidecar file contains no tokens at all.
        OSError: If either file cannot be opened or read.
    """
    with open(f"{path}.md5", "rt") as f:
        # Hex digests are case-insensitive; hexdigest() below is lowercase.
        expected_checksum = f.read().split()[-1].lower()

    md5_hash = hashlib.md5()
    with open(path, "rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(checksum_chunk_size), b""):
            md5_hash.update(chunk)

    actual_checksum = md5_hash.hexdigest()
    if expected_checksum != actual_checksum:
        # Raised explicitly instead of using an ``assert`` statement: asserts
        # are stripped under ``python -O``, which would silently disable the
        # integrity check.
        raise AssertionError(
            f"MD5 mismatch for {path}: expected {expected_checksum}, "
            f"got {actual_checksum}"
        )
class PubmedContentHandler(xml.sax.handler.ContentHandler):
    """SAX handler that writes one ``[pmid, abstract]`` CSV row per article.

    For every ``<PubmedArticle>`` element the handler collects the text of the
    first ``<PMID>`` and of all ``<AbstractText>`` elements (including text
    inside nested inline markup). When the article closes, a row is written
    if both a PMID and a non-empty abstract were found.

    The same instance can be attached to several parsers in turn; per-article
    state is cleared at every article boundary.
    """

    def __init__(self, file_handle) -> None:
        """Create a handler that writes CSV rows to *file_handle*.

        Args:
            file_handle: Writable text file object handed to ``csv.writer``.
        """
        super().__init__()
        self.tqdm = tqdm.tqdm()
        self.csv_writer = csv.writer(file_handle)
        self.reset()

    def reset(self) -> None:
        """Clear all per-article state."""
        # Name of the element whose text is currently captured ("" = none).
        self.current_element = ""
        # PMID text of the article being parsed.
        self.current_pmid = ""
        # Raw character chunks of the article's abstract, in document order.
        self.current_content = []

    def startElement(self, name, attrs):
        """Begin capturing text for PMID/AbstractText; reset on a new article."""
        super().startElement(name, attrs)
        if name == "PubmedArticle":
            self.reset()
        elif name == "AbstractText":
            self.current_element = name
        elif name == "PMID" and not self.current_pmid:
            # Only the first PMID identifies the article itself. Records may
            # contain further <PMID> elements (e.g. for cited or commented
            # articles) whose digits must not be appended to this one.
            self.current_element = name

    def endElement(self, name):
        """Stop capturing on PMID/AbstractText; emit a row when an article ends."""
        super().endElement(name)
        if name == "AbstractText":
            # Separate consecutive abstract sections with whitespace.
            self.current_content.append(" ")
            self.current_element = ""
        elif name == "PMID":
            self.current_element = ""
        elif name == "PubmedArticle":
            pmid = self.current_pmid.strip()
            # Normalise whitespace once over the whole abstract. The parser
            # may deliver one text run as several chunks (around inline tags,
            # entities, or buffer boundaries), so per-chunk normalisation
            # would inject spaces in the middle of words.
            abstract = " ".join("".join(self.current_content).split())
            if pmid and abstract:
                self.csv_writer.writerow([pmid, abstract])
            # Advance the progress bar once per article seen, whether or not
            # a row was written.
            self.tqdm.update()
            self.reset()

    def characters(self, content):
        """Accumulate character data for the element currently captured."""
        super().characters(content)
        if self.current_element == "PMID":
            self.current_pmid += content
        elif self.current_element == "AbstractText":
            self.current_content.append(content)
# Stream every checksum-verified archive through a single handler so that all
# extracted rows end up in one CSV file. PubMed XML is UTF-8, so both
# encodings are spelled out instead of depending on the platform default.
with open("articles.csv", "wt", newline="", encoding="utf-8") as f_out:
    handler = PubmedContentHandler(f_out)
    for file in file_list:
        verify(file)
        # A fresh parser per file: never carry one over from the previous
        # archive, whose document has already ended.
        parser = None
        with gzip.open(file, "rt", encoding="utf-8") as f:
            for line in f:
                # Start a new document at every XML declaration, and also on
                # the first line of a file that has no declaration.
                if parser is None or line.startswith("<?xml"):
                    if parser is not None:
                        # Finish the previous document before starting anew.
                        parser.close()
                    parser = xml.sax.make_parser()
                    parser.setContentHandler(handler)
                parser.feed(line)
        if parser is not None:
            # close() flushes the incremental parser and reports a truncated
            # or otherwise incomplete document instead of ignoring it.
            parser.close()
    # Finalise the progress bar display.
    handler.tqdm.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment