nickrsan · October 18, 2018 04:37
diff --git a/get_papers.py b/get_papers.py
 import os
 from time import sleep
 from csv import DictWriter, writer
 import re

 from habanero import Crossref  # CrossRef API access

 HABANERO_USERNAME = ""  # email address handed to Crossref as `mailto`, so they can contact you if your script misbehaves
 ISSN = ""  # ISSN of the journal to dump data for
 BASE_FOLDER = os.path.dirname(os.path.abspath(__file__))  # directory containing this script; every output path below is built from it
 OUTPUT_FILE = os.path.join(BASE_FOLDER, "{}.csv".format(ISSN))  # per-paper CSV, named <ISSN>.csv
 TITLE_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_title_frequency.csv".format(ISSN))  # title word counts, named <ISSN>_title_frequency.csv
 AUTHOR_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_author_frequency.csv".format(ISSN))  # author counts, named <ISSN>_author_frequency.csv
 INSTITUTION_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_insitution_frequency.csv".format(ISSN))  # institution counts; NOTE: the file name really is spelled "insitution" (sic)
 SLEEP_TIME = 0.05  # 50 ms pause meant to be nice to the CrossRef API. NOTE(review): not referenced by the code visible here - get_paper_info() calls sleep(1) instead
 PER_PAGE = 1000  # number of records requested per API call (sent as `limit`, and used as the offset step)
 # Fields copied out of each Crossref record by make_data_safe(); also the column list of the per-paper CSV.
 KEYS_TO_KEEP = [u'DOI', u'reference', u'issued', u'prefix', u'relation', u'author', u'reference-count', u'ISSN', u'member', u'source', u'score', u'deposited', u'indexed', u'type', u'published-online', u'URL', u'is-referenced-by-count', u'volume', u'issn-type', u'link', u'published-print', u'journal-issue', u'references-count', u'short-container-title', u'publisher', u'content-domain', u'language', u'license', u'created', u'issue', u'title', u'alternative-id', u'container-title', u'page']

 def make_data_safe(paper, keys=KEYS_TO_KEEP):
 	"""
 		Return a new dict holding only the entries of `paper` whose key is listed in `keys`.

 		Keys listed in `keys` but absent from `paper` are simply left out, and the
 		values are carried over as they are (no string conversion happens here).
 		This keeps unexpected fields away from the DictWriter used later.
 	"""
 	return {wanted: paper[wanted] for wanted in keys if wanted in paper}

 def _combine_dict_to_list(frequencies):
 	"""
 		Turn a mapping of item -> count into a list of [item, count] lists,
 		in the mapping's own iteration order, so a plain csv writer can write it.
 	"""
 	return [[name, frequencies[name]] for name in frequencies]
 	
 def frequency_titles(papers):
 	"""
 		Count how often each word occurs in the first title of every paper.

 		Words are runs of word characters, compared in lowercase; words of one or
 		two characters are ignored. Papers whose 'title' is missing or empty are skipped.

 		:param papers: iterable of paper dicts; 'title' is expected to be a list whose first element is the title text
 		:return: list of [word, count] lists, ready for a csv writer's writerows()
 	"""
 	print("Getting frequency of words in titles")
 	words = {}
 	for paper in papers:
 		titles = paper.get('title')
 		if not titles:  # missing or empty title list: nothing to count
 			continue
 		# Match on text, not on UTF-8 encoded bytes: a str pattern cannot be applied to bytes on
 		# Python 3, and byte-wise matching cuts words apart at every non-ASCII letter.
 		for word in re.findall(r"\w+", titles[0], re.UNICODE):
 			match_word = word.lower()
 			if len(match_word) > 2:
 				words[match_word] = words.get(match_word, 0) + 1
 	# Python 2's csv module wants byte strings, so encode only where text is not the native str type.
 	return [[word if isinstance(word, str) else word.encode('utf-8'), count] for word, count in words.items()]
 	
 def frequency_institutions(papers):
 	"""
 		Count how often each institution appears among the authors' affiliations.

 		Every affiliation name is lowercased and split on commas. If a part mentions
 		"university", that part stands in for the whole affiliation (the last such part
 		wins) - an attempt to count the university itself rather than its institute,
 		school, center or department. Otherwise the full name is used. Surrounding
 		whitespace is stripped before counting.

 		:param papers: iterable of paper dicts; papers without 'author', authors without
 			'affiliation' and affiliations without a 'name' are skipped
 		:return: list of [institution, count] lists, ready for a csv writer's writerows()
 	"""
 	print("Getting frequency of institutions")
 	institutions = {}
 	for paper in papers:
 		for author in paper.get("author", ()):
 			for affiliation in author.get("affiliation", ()):
 				name = affiliation.get('name')
 				if not name:
 					continue
 				# Stay in text: mixing UTF-8 bytes with str arguments fails on Python 3.
 				institution = name.lower()
 				for part in institution.split(","):
 					if "university" in part:
 						institution = part
 				institution = institution.strip()  # comma-separated parts usually start with a space
 				institutions[institution] = institutions.get(institution, 0) + 1
 	# Python 2's csv module wants byte strings, so encode only where text is not the native str type.
 	return [[name if isinstance(name, str) else name.encode('utf-8'), count] for name, count in institutions.items()]
 	
 	
 def frequency_authors(papers):
 	"""
 		Count how many papers each author appears on.

 		An author is identified by given name followed by family name, with all spaces
 		removed; an author with neither is counted under the empty string. On papers
 		whose first title starts with "book review" (case-insensitive) the identifier
 		gets a "_book_review" suffix, so reviewing is counted apart from publishing.

 		:param papers: iterable of paper dicts; papers without 'author' are skipped
 		:return: list of [author, count] lists, ready for a csv writer's writerows()
 	"""
 	print("Getting frequency of authors")
 	authors = {}
 	for paper in papers:
 		if "author" not in paper:
 			continue
 		titles = paper.get('title')
 		# Decided once per paper instead of once per author; a missing or empty title is not a review.
 		is_book_review = bool(titles) and titles[0].lower().startswith("book review")
 		for author in paper['author']:
 			# Stay in text: calling replace() with str arguments on UTF-8 bytes fails on Python 3.
 			author_combined = u"{}{}".format(author.get(u'given', u""), author.get(u'family', u"")).replace(" ", "")
 			if is_book_review:
 				author_combined += "_book_review"  # call these out separately so we know who is publishing and who is reviewing
 			authors[author_combined] = authors.get(author_combined, 0) + 1
 	# Python 2's csv module wants byte strings, so encode only where text is not the native str type.
 	return [[name if isinstance(name, str) else name.encode('utf-8'), count] for name, count in authors.items()]

 def write_frequencies(items):
 	"""
 		Write one two-column CSV (header "Word", "Frequency") per frequency table.

 		:param items: iterable of dicts with the keys "name" (label used in the progress
 			message), "path" (file to write) and "data" (rows of [item, count])
 	"""
 	# The csv module needs a binary handle on Python 2 (where str is bytes) but a text handle
 	# opened with newline='' on Python 3; writing to a 'wb' handle there raises TypeError.
 	if str is bytes:
 		mode, open_options = 'wb', {}
 	else:
 		mode, open_options = 'w', {'newline': '', 'encoding': 'utf-8'}
 	for item in items:
 		print("Writing Frequency Info for {}".format(item["name"]))
 		with open(item["path"], mode, **open_options) as output_file_handle:
 			csv_writer = writer(output_file_handle)
 			csv_writer.writerow(["Word", "Frequency"])
 			csv_writer.writerows(item["data"])
 	
 def get_papers(issn=ISSN, offset=0, per_page=PER_PAGE):
 	"""
 		Request one page of works for a journal from the Crossref API.

 		:param issn: ISSN used as the "issn" filter of the request
 		:param offset: passed through as the request's `offset`
 		:param per_page: passed through as the request's `limit`
 		:return: whatever Crossref.works() returns for that request
 	"""
 	client = Crossref(mailto=HABANERO_USERNAME)
 	issn_filter = {"issn": issn}
 	return client.works(filter=issn_filter, offset=offset, limit=per_page)

 def get_paper_info():
 	"""
 		Download the records Crossref lists for ISSN, one page of PER_PAGE records at a time.

 		Only records that carry a 'title' key are kept, each reduced by make_data_safe().

 		:return: list of paper dicts
 	"""
 	paper_info = get_papers(ISSN, 0, PER_PAGE)
 	num_papers = paper_info['message'][u'total-results']
 	print("Found {} papers".format(num_papers))

 	papers = []
 	collected_info = 0
 	while collected_info < num_papers:
 		collected_info += PER_PAGE
 		print("Collecting {} papers".format(collected_info))
 		items = paper_info['message']['items']
 		# if it has a title in the data, we'll keep it
 		papers.extend(make_data_safe(paper) for paper in items if 'title' in paper)
 		if not items or collected_info >= num_papers:
 			# Last page handled (or the API returned nothing more): stop here instead of
 			# sleeping and requesting one more page whose result would be thrown away.
 			break
 		sleep(1)
 		paper_info = get_papers(ISSN, collected_info, PER_PAGE)  # get the next page
 	return papers

 def write_derived_products(papers):
 	"""
 		Compute the title, author and institution frequency tables for `papers` and
 		write all outputs: the per-paper CSV at OUTPUT_FILE (columns KEYS_TO_KEEP) and
 		one frequency CSV per table.

 		:param papers: list of paper dicts, as returned by get_paper_info()
 	"""
 	title_frequency_info = frequency_titles(papers)
 	author_frequency_info = frequency_authors(papers)
 	institution_frequency_info = frequency_institutions(papers)

 	print("Writing Paper Info")
 	# The csv module needs a binary handle on Python 2 (where str is bytes) but a text handle
 	# opened with newline='' on Python 3; writing to a 'wb' handle there raises TypeError.
 	if str is bytes:
 		mode, open_options = 'wb', {}
 	else:
 		mode, open_options = 'w', {'newline': '', 'encoding': 'utf-8'}
 	with open(OUTPUT_FILE, mode, **open_options) as output_file_handle:
 		csv_writer = DictWriter(output_file_handle, fieldnames=KEYS_TO_KEEP)
 		csv_writer.writeheader()
 		csv_writer.writerows(papers)

 	write_frequencies([{"name":"Title", "path":TITLE_FREQUENCY_FILE, "data": title_frequency_info},
 						{"name": "Author", "path":AUTHOR_FREQUENCY_FILE, "data": author_frequency_info},
 						{"name": "Institution", "path": INSTITUTION_FREQUENCY_FILE, "data": institution_frequency_info}])

 # Script entry point: download the journal's papers, then write the per-paper CSV and the frequency CSVs.
 if __name__ == "__main__":
 	papers = get_paper_info()
 	write_derived_products(papers)
	import os
	from time import sleep
	from csv import DictWriter, writer
	import re

	from habanero import Crossref # CrossRef API access

	HABANERO_USERNAME = "" # email address handed to Crossref as `mailto`, so they can contact you if your script misbehaves
	ISSN = "" # ISSN of the journal to dump data for
	BASE_FOLDER = os.path.dirname(os.path.abspath(__file__)) # directory containing this script; every output path below is built from it
	OUTPUT_FILE = os.path.join(BASE_FOLDER, "{}.csv".format(ISSN)) # per-paper CSV, named <ISSN>.csv
	TITLE_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_title_frequency.csv".format(ISSN)) # title word counts, named <ISSN>_title_frequency.csv
	AUTHOR_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_author_frequency.csv".format(ISSN)) # author counts, named <ISSN>_author_frequency.csv
	INSTITUTION_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_insitution_frequency.csv".format(ISSN)) # institution counts; NOTE: the file name really is spelled "insitution" (sic)
	SLEEP_TIME = 0.05 # 50 ms pause meant to be nice to the CrossRef API. NOTE(review): not referenced by the code visible here - get_paper_info() calls sleep(1) instead
	PER_PAGE = 1000 # number of records requested per API call (sent as `limit`, and used as the offset step)
	# Fields copied out of each Crossref record by make_data_safe(); also the column list of the per-paper CSV.
	KEYS_TO_KEEP = [u'DOI', u'reference', u'issued', u'prefix', u'relation', u'author', u'reference-count', u'ISSN', u'member', u'source', u'score', u'deposited', u'indexed', u'type', u'published-online', u'URL', u'is-referenced-by-count', u'volume', u'issn-type', u'link', u'published-print', u'journal-issue', u'references-count', u'short-container-title', u'publisher', u'content-domain', u'language', u'license', u'created', u'issue', u'title', u'alternative-id', u'container-title', u'page']

	def make_data_safe(paper, keys=KEYS_TO_KEEP):
		"""
			Return a new dict holding only the entries of `paper` whose key is listed in `keys`.

			Keys listed in `keys` but absent from `paper` are simply left out, and the
			values are carried over as they are (no string conversion happens here).
			This keeps unexpected fields away from the DictWriter used later.
		"""
		return {wanted: paper[wanted] for wanted in keys if wanted in paper}

	def _combine_dict_to_list(frequencies):
		"""
			Turn a mapping of item -> count into a list of [item, count] lists,
			in the mapping's own iteration order, so a plain csv writer can write it.
		"""
		return [[name, frequencies[name]] for name in frequencies]

	def frequency_titles(papers):
		"""
			Count how often each word occurs in the first title of every paper.

			Words are runs of word characters, compared in lowercase; words of one or
			two characters are ignored. Papers whose 'title' is missing or empty are skipped.

			:param papers: iterable of paper dicts; 'title' is expected to be a list whose first element is the title text
			:return: list of [word, count] lists, ready for a csv writer's writerows()
		"""
		print("Getting frequency of words in titles")
		words = {}
		for paper in papers:
			titles = paper.get('title')
			if not titles:  # missing or empty title list: nothing to count
				continue
			# Match on text, not on UTF-8 encoded bytes: a str pattern cannot be applied to bytes on
			# Python 3, and byte-wise matching cuts words apart at every non-ASCII letter.
			for word in re.findall(r"\w+", titles[0], re.UNICODE):
				match_word = word.lower()
				if len(match_word) > 2:
					words[match_word] = words.get(match_word, 0) + 1
		# Python 2's csv module wants byte strings, so encode only where text is not the native str type.
		return [[word if isinstance(word, str) else word.encode('utf-8'), count] for word, count in words.items()]

	def frequency_institutions(papers):
		"""
			Count how often each institution appears among the authors' affiliations.

			Every affiliation name is lowercased and split on commas. If a part mentions
			"university", that part stands in for the whole affiliation (the last such part
			wins) - an attempt to count the university itself rather than its institute,
			school, center or department. Otherwise the full name is used. Surrounding
			whitespace is stripped before counting.

			:param papers: iterable of paper dicts; papers without 'author', authors without
				'affiliation' and affiliations without a 'name' are skipped
			:return: list of [institution, count] lists, ready for a csv writer's writerows()
		"""
		print("Getting frequency of institutions")
		institutions = {}
		for paper in papers:
			for author in paper.get("author", ()):
				for affiliation in author.get("affiliation", ()):
					name = affiliation.get('name')
					if not name:
						continue
					# Stay in text: mixing UTF-8 bytes with str arguments fails on Python 3.
					institution = name.lower()
					for part in institution.split(","):
						if "university" in part:
							institution = part
					institution = institution.strip()  # comma-separated parts usually start with a space
					institutions[institution] = institutions.get(institution, 0) + 1
		# Python 2's csv module wants byte strings, so encode only where text is not the native str type.
		return [[name if isinstance(name, str) else name.encode('utf-8'), count] for name, count in institutions.items()]


	def frequency_authors(papers):
		"""
			Count how many papers each author appears on.

			An author is identified by given name followed by family name, with all spaces
			removed; an author with neither is counted under the empty string. On papers
			whose first title starts with "book review" (case-insensitive) the identifier
			gets a "_book_review" suffix, so reviewing is counted apart from publishing.

			:param papers: iterable of paper dicts; papers without 'author' are skipped
			:return: list of [author, count] lists, ready for a csv writer's writerows()
		"""
		print("Getting frequency of authors")
		authors = {}
		for paper in papers:
			if "author" not in paper:
				continue
			titles = paper.get('title')
			# Decided once per paper instead of once per author; a missing or empty title is not a review.
			is_book_review = bool(titles) and titles[0].lower().startswith("book review")
			for author in paper['author']:
				# Stay in text: calling replace() with str arguments on UTF-8 bytes fails on Python 3.
				author_combined = u"{}{}".format(author.get(u'given', u""), author.get(u'family', u"")).replace(" ", "")
				if is_book_review:
					author_combined += "_book_review"  # call these out separately so we know who is publishing and who is reviewing
				authors[author_combined] = authors.get(author_combined, 0) + 1
		# Python 2's csv module wants byte strings, so encode only where text is not the native str type.
		return [[name if isinstance(name, str) else name.encode('utf-8'), count] for name, count in authors.items()]

	def write_frequencies(items):
		"""
			Write one two-column CSV (header "Word", "Frequency") per frequency table.

			:param items: iterable of dicts with the keys "name" (label used in the progress
				message), "path" (file to write) and "data" (rows of [item, count])
		"""
		# The csv module needs a binary handle on Python 2 (where str is bytes) but a text handle
		# opened with newline='' on Python 3; writing to a 'wb' handle there raises TypeError.
		if str is bytes:
			mode, open_options = 'wb', {}
		else:
			mode, open_options = 'w', {'newline': '', 'encoding': 'utf-8'}
		for item in items:
			print("Writing Frequency Info for {}".format(item["name"]))
			with open(item["path"], mode, **open_options) as output_file_handle:
				csv_writer = writer(output_file_handle)
				csv_writer.writerow(["Word", "Frequency"])
				csv_writer.writerows(item["data"])

	def get_papers(issn=ISSN, offset=0, per_page=PER_PAGE):
		"""
			Request one page of works for a journal from the Crossref API.

			:param issn: ISSN used as the "issn" filter of the request
			:param offset: passed through as the request's `offset`
			:param per_page: passed through as the request's `limit`
			:return: whatever Crossref.works() returns for that request
		"""
		crossref_api = Crossref(mailto=HABANERO_USERNAME)
		return crossref_api.works(filter={"issn": issn}, offset=offset, limit=per_page)

	def get_paper_info():
		"""
			Download the records Crossref lists for ISSN, one page of PER_PAGE records at a time.

			Only records that carry a 'title' key are kept, each reduced by make_data_safe().

			:return: list of paper dicts
		"""
		paper_info = get_papers(ISSN, 0, PER_PAGE)
		num_papers = paper_info['message'][u'total-results']
		print("Found {} papers".format(num_papers))

		papers = []
		collected_info = 0
		while collected_info < num_papers:
			collected_info += PER_PAGE
			print("Collecting {} papers".format(collected_info))
			items = paper_info['message']['items']
			# if it has a title in the data, we'll keep it
			papers.extend(make_data_safe(paper) for paper in items if 'title' in paper)
			if not items or collected_info >= num_papers:
				# Last page handled (or the API returned nothing more): stop here instead of
				# sleeping and requesting one more page whose result would be thrown away.
				break
			sleep(1)
			paper_info = get_papers(ISSN, collected_info, PER_PAGE)  # get the next page
		return papers

	def write_derived_products(papers):
		"""
			Compute the title, author and institution frequency tables for `papers` and
			write all outputs: the per-paper CSV at OUTPUT_FILE (columns KEYS_TO_KEEP) and
			one frequency CSV per table.

			:param papers: list of paper dicts, as returned by get_paper_info()
		"""
		title_frequency_info = frequency_titles(papers)
		author_frequency_info = frequency_authors(papers)
		institution_frequency_info = frequency_institutions(papers)

		print("Writing Paper Info")
		# The csv module needs a binary handle on Python 2 (where str is bytes) but a text handle
		# opened with newline='' on Python 3; writing to a 'wb' handle there raises TypeError.
		if str is bytes:
			mode, open_options = 'wb', {}
		else:
			mode, open_options = 'w', {'newline': '', 'encoding': 'utf-8'}
		with open(OUTPUT_FILE, mode, **open_options) as output_file_handle:
			csv_writer = DictWriter(output_file_handle, fieldnames=KEYS_TO_KEEP)
			csv_writer.writeheader()
			csv_writer.writerows(papers)

		write_frequencies([{"name":"Title", "path":TITLE_FREQUENCY_FILE, "data": title_frequency_info},
							{"name": "Author", "path":AUTHOR_FREQUENCY_FILE, "data": author_frequency_info},
							{"name": "Institution", "path": INSTITUTION_FREQUENCY_FILE, "data": institution_frequency_info}])

	# Script entry point: download the journal's papers, then write the per-paper CSV and the frequency CSVs.
	if __name__ == "__main__":
		papers = get_paper_info()
		write_derived_products(papers)