Skip to content

Instantly share code, notes, and snippets.

@almugabo
Created June 3, 2021 04:29
Show Gist options
  • Save almugabo/ee4eff46dc1165374c85f24dbef4abb9 to your computer and use it in GitHub Desktop.
Save almugabo/ee4eff46dc1165374c85f24dbef4abb9 to your computer and use it in GitHub Desktop.
extraction of references in structured form
"""
FROM: https://gist.github.com/sobolevnrm/412763ebae5424a92d3239898b615e2a
Process RIS format following the standard at
http://referencemanager.com/sites/rm/files/m/direct_export_ris.pdf """
import re
# Two-character RIS field tags mapped to a human-readable description.
# "TY" opens a record and "ER" closes it; the rest describe the reference.
ALLOWED_TAGS = {
    "TY": "Record start",
    "ER": "Record end",
    "A2": "Secondary author",
    "A3": "Tertiary Author",
    "A4": "Subsidiary Author",
    "AB": "Abstract",
    "AD": "Author Address",
    "AN": "Accession Number",
    "AU": "Author",
    "C1": "Custom 1",
    "C2": "Custom 2",
    "C3": "Custom 3",
    "C4": "Custom 4",
    "C5": "Custom 5",
    "C6": "Custom 6",
    "C7": "Custom 7",
    "C8": "Custom 8",
    "CA": "Caption",
    "CN": "Call Number",
    "CY": "Place Published",
    "DA": "Date",
    "DB": "Name of Database",
    "DO": "DOI",
    "DP": "Database Provider",
    "ET": "Edition",
    "J2": "Alternate Title",
    "KW": "Keywords",
    "L1": "File Attachments",
    "L4": "Figure",
    "LA": "Language",
    "LB": "Label",
    "IS": "Number",
    "M3": "Type of Work",
    "N1": "Notes",
    "NV": "Number of Volumes",
    "OP": "Original Publication",
    "PB": "Publisher",
    "PY": "Year",
}
# RIS reference-type codes (the value of a "TY" field) mapped to a
# human-readable description.
REFERENCE_TYPES = {
    "ABST": "Abstract",
    "ADVS": "Audiovisual material",
    "ART": "Art Work",
    "BILL": "Bill/Resolution",
    "BOOK": "Book, Whole",
    "CASE": "Case",
    "CHAP": "Book chapter",
    "COMP": "Computer program",
    "CONF": "Conference proceeding",
    "CTLG": "Catalog",
    "DATA": "Data file",
    "ELEC": "Electronic Citation",
    "GEN": "Generic",
    "HEAR": "Hearing",
    "ICOMM": "Internet Communication",
    "INPR": "In Press",
    "JFULL": "Journal (full)",
    "JOUR": "Journal",
    "MAP": "Map",
    "MGZN": "Magazine article",
    "MPCT": "Motion picture",
    "MUSIC": "Music score",
    "NEWS": "Newspaper",
    "PAMP": "Pamphlet",
    "PAT": "Patent",
    "PCOMM": "Personal communication",
    "RPRT": "Report",
    "SER": "Serial (Book, Monograph)",
    "SLIDE": "Slide",
    "SOUND": "Sound recording",
    "STAT": "Statute",
    "THES": "Thesis/Dissertation",
    # Was misspelled "UNBILl" (lowercase trailing letter), so a lookup with
    # the real type code failed.
    "UNBILL": "Unenacted bill/resolution",
    "UNPB": "Unpublished work",
    "VIDEO": "Video recording",
}
class RIS:
    """Parser for RIS bibliographic files.

    Parsed records accumulate in ``self.records``, one dict per record keyed
    by the two-character RIS tag.  Tags listed in ``_LIST_TAGS`` map to a
    list of strings (one entry per occurrence); every other tag maps to a
    single string.
    """

    # One field per line: a two-character tag, a hyphen with optional
    # surrounding spaces, then the value.  Compiled once for all parses.
    _FIELD_RE = re.compile(r"^([A-Z][A-Z0-9]) *- *(.*)")
    # Tags that are collected into a list instead of being rejected as
    # duplicates when they repeat within a record.
    _LIST_TAGS = frozenset({"AU", "AD", "KW", "N1"})

    def __init__(self, in_file=None):
        """Create a parser and, when *in_file* is given, parse it at once.

        Args:
            in_file: Optional iterable of text lines, e.g. an open text file.
        """
        self.records = []
        # Set here as well as in parse() so that process_field() can be
        # called directly without hitting an AttributeError.
        self.current_tag = None
        self.current_record = None
        if in_file is not None:
            self.parse(in_file)

    def parse(self, in_file):
        """Parse every line of *in_file* and append records to ``records``.

        Blank lines are skipped.  A UTF-8 byte-order mark is removed, since
        ``str.strip`` does not treat it as whitespace and it would otherwise
        make the first field unparseable.

        Args:
            in_file: Iterable of text lines.

        Raises:
            ValueError: If a non-blank line is not a ``TAG - value`` field,
                or if process_field() rejects the field.
        """
        # current_tag is only reset, never updated; kept because external
        # code may read the attribute.
        self.current_tag = None
        self.current_record = None
        for raw_line in in_file:
            line = raw_line.lstrip("\ufeff").strip()
            if not line:
                continue
            match = self._FIELD_RE.match(line)
            if match is None:
                raise ValueError(line)
            tag, field = match.groups()
            self.process_field(tag, field)

    def process_field(self, tag, field):
        """Apply a single ``tag``/``field`` pair to the record being built.

        ``TY`` starts a new record, ``ER`` stores the current record in
        ``records``, and any other tag is added to the current record.

        Args:
            tag: Two-character RIS tag.
            field: Text value of the field.

        Raises:
            ValueError: If a tag other than ``TY`` arrives while no record is
                open, or if a non-repeatable tag occurs twice in one record.
        """
        if tag == "TY":
            self.current_record = {tag: field}
            return
        if self.current_record is None:
            # Previously this surfaced as a TypeError (membership test on
            # None) or, for "ER", silently appended None to the records.
            raise ValueError("Tag outside of a record: %s" % tag)
        if tag == "ER":
            self.records.append(self.current_record)
            self.current_record = None
        elif tag in self._LIST_TAGS:
            self.current_record.setdefault(tag, []).append(field)
        elif tag in self.current_record:
            raise ValueError("Duplicate tag: %s" % tag)
        else:
            self.current_record[tag] = field
import pandas as pd
# NOTE: `xFile` (the path of the RIS file to read) is not defined in this
# snippet; it must already exist in the session before this runs.
with open(xFile, 'r', encoding="utf8") as ris_file:
    ris = RIS(ris_file)

# Build a table from the parsed record dictionaries.
d1 = pd.DataFrame(ris.records)
print(d1.shape[0])
# Bare expression: displays the first rows when run as the last statement of
# a notebook cell, and has no visible effect in a plain script.
d1.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment