Skip to content

Instantly share code, notes, and snippets.

@dmyersturnbull
Created May 20, 2020 19:48
Show Gist options
  • Save dmyersturnbull/97c8a1c15bce086d21cf8affa04cf36c to your computer and use it in GitHub Desktop.
Save dmyersturnbull/97c8a1c15bce086d21cf8affa04cf36c to your computer and use it in GitHub Desktop.
Make my BibTeX file a little less stupid. Fix UTF, don't break in weird places, and don't list DOIs as URLs.
import sys
import re
from pathlib import Path
def fix(path: Path, accent_file: Path, output_path: Path) -> None:
    """
    Reformat a BibTeX file.

    * Remove PMC and DOI URL lines because they're listed as IDs anyway.
    * Replace accent-like escape sequences with their UTF equivalents.
    * Remove line breaks in the middle of double-quoted field values.
    * TODO: Remove abstracts.
    * TODO: Auto-remove duplicates

    Args:
        path: Path (or str) to the file to correct.
        accent_file: Path (or str) to a tab-delimited file of
            (escape sequence, character) pairs, one per line, with no header.
            Blank lines and lines without a tab are ignored.
        output_path: Path (or str) to write to; will overwrite if it exists.
    """
    output_path = Path(output_path)
    path = Path(path)

    # Build the escape-sequence -> character table. Lines that do not hold a
    # (key, value) pair are skipped instead of raising IndexError, and an
    # empty key is rejected because str.replace('', value) would insert the
    # value between every character of the text.
    accents = {}
    for line in Path(accent_file).read_text(encoding='utf8').splitlines():
        fields = line.split('\t')
        if len(fields) < 2 or not fields[0]:
            continue
        accents[fields[0]] = fields[1]

    text = path.read_text(encoding='utf8')
    for escape, char in accents.items():
        # Replace the brace-wrapped form first so the braces go away too.
        text = text.replace('{' + escape + '}', char).replace(escape, char)
    text = text.replace('PLoS', 'PLOS')

    # Drop every line that mentions a PubMed Central or dx.doi.org URL.
    text = '\n'.join(
        line
        for line in text.splitlines()
        if 'pubmedcentral.nih.gov' not in line and 'dx.doi.org/' not in line
    )

    # Matches `name = "value",` fields; the value may span several lines.
    field_pattern = re.compile('( *[a-z]+ *= *")([^"]+)(" *,)')
    whitespace_pattern = re.compile(r'[ \n]+')

    def _collapse_whitespace(match):
        # Join a wrapped value into one line: each run of spaces/newlines
        # becomes a single space; the surrounding `name = "` and `",` stay.
        inner = whitespace_pattern.sub(' ', match.group(2))
        return match.group(1) + inner + match.group(3)

    text = field_pattern.sub(_collapse_whitespace, text)
    output_path.write_text(text, encoding='utf8')
if __name__ == '__main__':
    # Usage: python <script> <bibtex-file>
    # Reads the accent table from 'accents.tsv' in the current directory and
    # writes the result next to the input as '<input>.fixed.bib'.
    if len(sys.argv) < 2:
        sys.exit('Usage: ' + sys.argv[0] + ' <bibtex-file>')
    # Bind the input path first: the output name is derived from it.
    path = Path(sys.argv[1])
    fix(path, Path('accents.tsv'), Path(str(path) + '.fixed.bib'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment