Last active
September 1, 2019 22:31
-
-
Save ahwagner/857d40b4416bcf7254ce6b366aaaaaac to your computer and use it in GitHub Desktop.
A Python method for extracting the date from a PubMed article into a Pandas datetime object
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import datetime | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
__author__ = "Alex H. Wagner" | |
# English three-letter month abbreviations, indexed so that position + 1 is
# the calendar month. Hard-coded so parsing does not depend on the process
# locale (``strptime('%b')`` does).
_MONTH_ABBREVIATIONS = (
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
)


def _month_number(text):
    """Convert a ``<Month>`` value ('Jan', 'January', '01', '1') to 1-12.

    Raises:
        ValueError: if *text* is not a recognizable month.
    """
    text = text.strip()
    if text.isdigit():
        return int(text)
    return _MONTH_ABBREVIATIONS.index(text[:3].lower()) + 1


def pandas_datetime_from_pmid(pmid, timeout=30):
    """Return the NCBI-reported publication date for a PubMed ID.

    Args:
        pmid: PubMed identifier; formatted into the request URL.
        timeout: Seconds to wait for the NCBI server before giving up
            (passed to ``requests.get``).

    Returns:
        A ``pandas.Timestamp`` built from the ``<PubDate>`` element's
        ``<Year>``, ``<Month>`` and ``<Day>`` children. A missing month or
        day defaults to 1. ``pandas.NaT`` is returned when the record has no
        ``<PubDate>`` element or that element has no ``<Year>`` child.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the year, month or day text cannot be interpreted.
    """
    resp = requests.get(
        'https://www.ncbi.nlm.nih.gov/pubmed/{0}?report=xml'.format(pmid),
        timeout=timeout,  # without a timeout the call can block forever
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "xml")
    # The record XML is expected to arrive escaped inside a <pre> element, so
    # its text is parsed a second time. If there is no <pre>, search the
    # response document itself instead of failing on None.
    # NOTE(review): confirm this URL/report format is still served by NCBI.
    pre = soup.find('pre')
    if pre is not None:
        soup = BeautifulSoup(pre.text, "xml")
    date = soup.find('PubDate')
    if date is None or date.Year is None:
        return pd.NaT
    year = int(date.Year.text)
    # Month and Day are optional children; fall back to the first of the
    # month/year when they are absent.
    month = _month_number(date.Month.text) if date.Month is not None else 1
    day = int(date.Day.text) if date.Day is not None else 1
    # pd.datetime was removed in pandas 2.0; pd.Timestamp is a
    # datetime.datetime subclass, so callers using .date() etc. still work.
    return pd.Timestamp(year=year, month=month, day=day)
if __name__ == '__main__':
    # Example run: look up one PMID and print its publication date.
    pmid = 26531824
    published = pandas_datetime_from_pmid(pmid)
    # The lookup returns pd.NaT when no publication date is found, and
    # NaT.date() raises ValueError, so handle that case explicitly.
    if published is pd.NaT:
        print("DGIdb (PMID: {}) has no reported publication date.".format(pmid))
    else:
        print("DGIdb (PMID: {}) was published on {}.".format(pmid, published.date()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment