Skip to content

Instantly share code, notes, and snippets.

Forked from mcs07/
Created February 28, 2019 18:24
Show Gist options
  • Save muammar/94b61088306d59a2fa478c892bd82819 to your computer and use it in GitHub Desktop.
Save muammar/94b61088306d59a2fa478c892bd82819 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download all UV-Vis spectra available from NIST Chemistry Webbook."""
import os
import re
import requests
from bs4 import BeautifulSoup
# Base CGI endpoint of the NIST Chemistry WebBook; every request in this
# module is sent to this URL with different query parameters.
NIST_URL = 'https://webbook.nist.gov/cgi/cbook.cgi'
# Link to an exact InChI hit; group 1 is everything after "GetInChI=".
# Raw strings keep "\?" and "\." as regex escapes rather than (invalid)
# string escapes.
EXACT_RE = re.compile(r'/cgi/cbook\.cgi\?GetInChI=(.*?)$')
# Link to a species page; group 1 is the text between "ID=" and the next "&".
ID_RE = re.compile(r'/cgi/cbook\.cgi\?ID=(.*?)&')
# Directories (relative to the current working directory) that receive the
# downloaded JCAMP-DX and MOL files respectively.
JDX_PATH = 'nist/jdx/'
MOL_PATH = 'nist/mol/'
def search_nist_inchi(inchi):
    """Search NIST using the specified InChI or InChIKey and return the matching NIST ID.

    Args:
        inchi: InChI or InChIKey string, sent as the ``InChI`` query parameter.

    Returns:
        The identifier captured by ``EXACT_RE`` from the first matching link
        on the result page, or ``None`` if the page has no such link.
    """
    print('Searching: %s' % inchi)
    response = requests.get(NIST_URL, params={'InChI': inchi, 'Units': 'SI'})
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed (and warns), so results can vary between machines.
    soup = BeautifulSoup(response.text, 'html.parser')
    idlink = soup.find('a', href=EXACT_RE)
    if idlink is None:
        # No exact match. The page may instead list similar species (links
        # matching ID_RE); those are currently ignored.
        return None
    # BeautifulSoup filters attributes with pattern.search(), so extract with
    # search() too; an anchored match() could return None for a link that
    # passed the filter.
    nistid = EXACT_RE.search(idlink['href']).group(1)
    print('Result: %s' % nistid)
    return nistid
def search_nist_formula(formula, allow_other=False, allow_extra=False, match_isotopes=False,
                        exclude_ions=False, has_uv=False):
    """Search NIST using the specified formula query and return the matching NIST IDs.

    Args:
        formula: Formula string, sent as the ``Formula`` query parameter.
        allow_other: If true, send ``AllowOther=on`` (callers in this file use
            it to allow elements not named in the formula).
        allow_extra: If true, send ``AllowExtra=on``.
        match_isotopes: If true, send ``MatchIso=on``.
        exclude_ions: If true, send ``NoIon=on``.
        has_uv: If true, send ``cUV=on`` (callers in this file use it to
            restrict results to species with UV-Vis spectra).

    Returns:
        A list with one ID string per link on the result page whose ``href``
        matches ``ID_RE``; empty if there are none.
    """
    print('Searching: %s' % formula)
    params = {'Formula': formula, 'Units': 'SI'}
    # Each enabled option is submitted as a checked form box ("on").
    option_flags = (
        ('AllowOther', allow_other),
        ('AllowExtra', allow_extra),
        ('MatchIso', match_isotopes),
        ('NoIon', exclude_ions),
        ('cUV', has_uv),
    )
    for key, enabled in option_flags:
        if enabled:
            params[key] = 'on'
    response = requests.get(NIST_URL, params=params)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed (and warns), so results can vary between machines.
    soup = BeautifulSoup(response.text, 'html.parser')
    # BeautifulSoup filters attributes with pattern.search(), so extract with
    # search() too; an anchored match() could return None for a link that
    # passed the filter.
    ids = [ID_RE.search(link['href']).group(1) for link in soup('a', href=ID_RE)]
    print('Result: %s' % ids)
    return ids
def get_jdx(nistid, stype='UVVis'):
    """Download jdx file for the specified NIST ID, unless already downloaded.

    The file is saved as ``<JDX_PATH>/<nistid>-<stype>.jdx``.

    Args:
        nistid: NIST ID, sent as the ``JCAMP`` query parameter.
        stype: Spectrum type, sent as the ``Type`` query parameter and used in
            the file name.

    Returns:
        None. Nothing is written if the file already exists or the server
        replies with its "Spectrum not found" placeholder.
    """
    filepath = os.path.join(JDX_PATH, '%s-%s.jdx' % (nistid, stype))
    if os.path.isfile(filepath):
        print('%s %s: Already exists at %s' % (nistid, stype, filepath))
        # Stop here, otherwise the existing file is downloaded and overwritten again.
        return
    print('%s %s: Downloading' % (nistid, stype))
    response = requests.get(NIST_URL, params={'JCAMP': nistid, 'Type': stype, 'Index': 0})
    if response.text == '##TITLE=Spectrum not found.\n##END=\n':
        print('%s %s: Spectrum not found' % (nistid, stype))
        # Do not save the placeholder: it would look like an already
        # downloaded spectrum on the next run.
        return
    # Create the output directory on first use so open() does not fail on a
    # missing path.
    if not os.path.isdir(JDX_PATH):
        os.makedirs(JDX_PATH)
    print('Saving %s' % filepath)
    with open(filepath, 'w') as jdx_file:
        jdx_file.write(response.text)
def get_mol(nistid):
    """Download mol file for the specified NIST ID, unless already downloaded.

    The file is saved as ``<MOL_PATH>/<nistid>.mol``.

    Args:
        nistid: NIST ID, sent as the ``Str2File`` query parameter.

    Returns:
        None. Nothing is written if the file already exists or the response
        equals the empty-structure placeholder checked below.
    """
    filepath = os.path.join(MOL_PATH, '%s.mol' % nistid)
    if os.path.isfile(filepath):
        print('%s: Already exists at %s' % (nistid, filepath))
        # Stop here, otherwise the existing file is downloaded and overwritten again.
        return
    print('%s: Downloading mol' % nistid)
    response = requests.get(NIST_URL, params={'Str2File': nistid})
    # NOTE(review): MOL files are column-aligned, so the runs of spaces in this
    # sentinel may have been collapsed when the script was copied; confirm it
    # against a real "not found" response.
    if response.text == 'NIST 12121112142D 1 1.00000 0.00000\nCopyright by the U.S. Sec. Commerce on behalf of U.S.A. All rights reserved.\n0 0 0 0 0 1 V2000\nM END\n':
        print('%s: MOL not found' % nistid)
        # Do not save the placeholder: it would look like an already
        # downloaded structure on the next run.
        return
    # Create the output directory on first use so open() does not fail on a
    # missing path.
    if not os.path.isdir(MOL_PATH):
        os.makedirs(MOL_PATH)
    print('Saving %s' % filepath)
    with open(filepath, 'w') as mol_file:
        mol_file.write(response.text)
def get_all_uvvis():
    """Find every NIST structure that has a UV-Vis spectrum and fetch its JDX file.

    Runs one formula search per carbon count from 1 to 99 and passes each
    returned ID to ``get_jdx``.
    """
    # A single search returns at most 400 results, so the work is split into
    # one query per carbon count (C1, C2, C3, ...), each allowing additional
    # elements that are not named in the formula.
    search_options = {'allow_other': True, 'exclude_ions': True, 'has_uv': True}
    for carbon_count in range(1, 100):
        formula = 'C%s' % carbon_count
        found_ids = search_nist_formula(formula, **search_options)
        print('%s spectra found' % len(found_ids))
        for found_id in found_ids:
            get_jdx(found_id, stype='UVVis')
if __name__ == '__main__':
    # One-off queries that can be run instead of the full download:
    # nistid = search_nist_inchi('ZYGHJZDHTFUPRJ-UHFFFAOYSA-N')
    # search_nist_formula('C20', allow_other=True, exclude_ions=True, has_uv=True)
    # Default action, as stated in the module docstring: download every
    # available UV-Vis spectrum.
    get_all_uvvis()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment