Created
May 6, 2020 07:53
-
-
Save jaklinger/548145a28d33923e7f2b0311772f7e7b to your computer and use it in GitHub Desktop.
Get papers from arxiv table, including filtering bio/med/arxiv and basic keyword filtering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from data_getters.core import get_engine | |
def bad_tokenizer(text):
    """Return a crude list of lower-cased, whitespace-separated tokens.

    The text is lower-cased, every "." is removed, and the result is split
    on whitespace. No other punctuation is handled, hence the name.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings (empty for empty or whitespace-only input).
    """
    # The original body referenced an undefined name `x`; use the parameter.
    return text.lower().replace(".", "").split()
# Columns read from the `arxiv_articles` table.
columns = ['id', 'created', 'title', 'abstract', 'mag_id', 'citation_count', 'article_source']
con = get_engine("/path/to/innovation-mapping-5712.config")
# Read the table in chunks of 1000 rows instead of loading it all at once.
chunks = pd.read_sql_table('arxiv_articles', con, columns=columns, chunksize=1000)
# Lower-case substrings to look for; abstracts are lower-cased before matching
# so that e.g. "COVID-19" and "Coronavirus" are found as well.
keywords = ('covid', 'covid-19', 'coronavirus')

matching_chunks = []
for df in chunks:
    # Boolean mask of rows whose abstract mentions any keyword. Non-string
    # abstracts (None / NaN) are treated as non-matching instead of raising.
    covid = df.abstract.apply(
        lambda text: isinstance(text, str)
        and any(term in text.lower() for term in keywords)
    )
    if not covid.any():
        continue
    matching_chunks.append(df.loc[covid])

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when nothing matched.
if matching_chunks:
    covid_df = pd.concat(matching_chunks)
else:
    covid_df = pd.DataFrame(columns=columns)

# Report the number of matching papers per article source.
for source, subset_df in covid_df.groupby('article_source'):
    print(source, len(subset_df))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment