|
#!/usr/bin/env python3 |
|
"""List the most frequently published-in conferences by authors publishing in <conf>. |
|
|
|
Usage: |
|
adjacent-confs.py download <conf> <year> [options] |
|
adjacent-confs.py file <dump> [options] |
|
adjacent-confs.py (-h | --help) |
|
|
|
Options: |
|
-h --help Show this screen. |
|
--dump <out> Save output dataframe to <out>. |
|
""" |
|
import requests |
|
from requests.exceptions import Timeout, RequestException |
|
from docopt import docopt |
|
import pandas as pd |
|
from time import sleep |
|
from json.decoder import JSONDecodeError |
|
|
|
def get_json(url, retry_limit=10, timeout=30, retry_delay=5):
    """Fetch ``url`` with HTTP GET and return its decoded JSON body.

    Failed attempts (connection errors, timeouts, HTTP error statuses,
    undecodable bodies) are reported on stdout and retried.

    Args:
        url: Address to request.
        retry_limit: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout a stalled connection would block the script forever.
        retry_delay: Seconds to pause between two attempts.

    Returns:
        The parsed JSON document.

    Raises:
        RuntimeError: If no attempt succeeded (including when
            ``retry_limit`` is not positive). The last underlying error,
            if any, is chained as the cause.
    """
    tries = 0
    last_error = None
    while tries < retry_limit:
        tries += 1
        try:
            r = requests.get(url, timeout=timeout)
            # An HTTP error status is a failed attempt worth retrying, not a
            # body to be parsed as if it were a result.
            r.raise_for_status()
            return r.json()
        # Timeout and HTTPError are RequestException subclasses; ValueError
        # covers JSON decoding failures (json.JSONDecodeError derives from it).
        except (RequestException, ValueError) as e:
            last_error = e
            print("Request to {} failed: {}".format(url, e))
            # Pausing is pointless once there is no further attempt to make.
            if tries < retry_limit:
                sleep(retry_delay)
    raise RuntimeError("request failed after {} tries".format(tries)) from last_error
|
|
|
def list_authors(conf, year):
    """Collect the names of all authors who published at ``conf`` in ``year``.

    Pages through the DBLP publication search API via ``get_json`` and
    gathers the authors of every hit that is not an editorship entry.
    Progress is printed after each page.

    Args:
        conf: Venue name, inserted verbatim into the DBLP query.
        year: Publication year, inserted verbatim into the DBLP query.

    Returns:
        A set of author name strings (empty when the query has no hits).
    """
    FMT = "http://dblp.org/search/publ/api?q=venue:{venue}:year:{year}:&format=json&f={first}&c=0"
    hits = get_json(FMT.format(venue=conf, year=year, first=0))["result"]["hits"]
    authors = set()
    total = int(hits["@total"])
    recvd = 0
    while True:
        sent = int(hits["@sent"])
        recvd += sent
        print("{} of {} received".format(recvd, total))
        # A page without results may omit the "hit" key entirely, so do not
        # index it directly.
        for pub in hits.get("hit", []):
            info = pub.get("info", {})
            if info.get("type") == "Editorship":
                continue
            try:
                auth = info["authors"]["author"]
            except (KeyError, TypeError) as e:
                print("Unable to read authors from pub {}: {}".format(pub, e))
                continue
            # A single author is delivered bare, several as a list.
            entries = auth if isinstance(auth, list) else [auth]
            for entry in entries:
                # NOTE(review): newer DBLP responses appear to wrap each
                # author as {"@pid": ..., "text": name}; accept both forms.
                name = entry.get("text") if isinstance(entry, dict) else entry
                if isinstance(name, str):
                    authors.add(name)
                else:
                    print("Unknown authors list {} of type {}".format(auth, type(auth)))
        # Stop on an empty page as well: if the server delivers fewer records
        # than it announced, waiting for recvd == total would never end.
        if sent == 0 or recvd >= total:
            return authors
        hits = get_json(FMT.format(venue=conf, year=year, first=recvd))["result"]["hits"]
|
|
|
def list_venues(author):
    """Map every venue ``author`` published in to the years of those papers.

    Pages through the DBLP publication search API via ``get_json``. Only
    hits typed "Conference and Workshop Papers" or "Journal Articles" are
    counted. Progress is printed after each page.

    Args:
        author: Author name; spaces are replaced by underscores for the
            query.

    Returns:
        A dict from venue to a list of year values, one list entry per
        publication (so years may repeat). Empty when DBLP lists no
        publications for the author.
    """
    FMT = "http://dblp.org/search/publ/api?q=author:{author}:&format=json&f={first}&c=0"
    COUNTED_TYPES = ("Conference and Workshop Papers", "Journal Articles")
    query_name = author.replace(" ", "_")

    hits = get_json(FMT.format(author=query_name, first=0))["result"]["hits"]
    venues = {}
    total = int(hits["@total"])

    if total == 0:
        print("No publications listed for author {}".format(author))
        return {}

    recvd = 0
    while True:
        sent = int(hits["@sent"])
        recvd += sent
        print("{} of {} received".format(recvd, total))
        # A page without results may omit the "hit" key entirely, so do not
        # index it directly.
        for pub in hits.get("hit", []):
            info = pub.get("info", {})
            if info.get("type") not in COUNTED_TYPES:
                continue
            try:
                venue = info["venue"]
                year = info["year"]
                # TypeError: an unhashable venue value cannot be a dict key;
                # such a record is reported and skipped.
                venues.setdefault(venue, []).append(year)
            except (KeyError, TypeError) as e:
                print("Could not get info from pub {}: {}".format(pub, e))
        # Stop on an empty page as well: if the server delivers fewer records
        # than it announced, waiting for recvd == total would never end.
        if sent == 0 or recvd >= total:
            return venues
        hits = get_json(FMT.format(author=query_name, first=recvd))["result"]["hits"]
|
|
|
def main():
    """Run the command-line interface described in the module docstring.

    ``download`` fetches the authors of the given conference/year and then
    every author's venues, building a dataframe with the columns
    ``author``, ``venue`` and ``year`` (one row per publication).
    ``file`` loads such a dataframe from a CSV file instead. The dataframe
    is printed and, when ``--dump`` is given, written to that path as CSV.
    """
    args = docopt(__doc__, version="0.1")
    if args["download"]:
        authors = list_authors(args["<conf>"], args["<year>"])
        print(authors)

        # Gather plain rows and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every call.
        # The resulting index is a single 0..N-1 range.
        rows = []
        for author in authors:
            try:
                vs = list_venues(author)
            except Exception as e:
                # Best effort: one author failing must not abort the run.
                print("Downloading pubs by {} failed (reason: {}), skipping.".format(author, e))
                continue
            rows.extend(
                {"author": author, "venue": v, "year": y}
                for v, yrs in vs.items()
                for y in yrs
            )
        df = pd.DataFrame(rows, columns=["author", "venue", "year"])
    else:
        # `file <dump>`: assumes <dump> is a CSV previously written through
        # --dump, whose first column is the dataframe index.
        df = pd.read_csv(args["<dump>"], index_col=0)

    print(df)
    if args["--dump"] is not None:
        df.to_csv(args["--dump"])


if __name__ == "__main__":
    main()