Last active
October 1, 2021 19:02
-
-
Save antonkratz/388676241155cbe093cc8eb718f633db to your computer and use it in GitHub Desktop.
From drug name to SMILES string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
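Invoke like this (note: the script currently ignores the command-line argument and always reads `oli_drugs.csv` from the current directory):
```bash
python GetSmiles.py oli_drugs.txt
```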
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pubchempy as pcp | |
import pandas as pd | |
import numpy as np | |
from collections import OrderedDict | |
# Input table of compound names, read from a hard-coded path in the current
# working directory. queryPubChem() reads the names from the column labelled '0'.
# NOTE(review): this statement sits outside the __main__ guard, so it appears to
# run on import as well as on script execution - confirm that is intended.
drugs_df = pd.read_csv('oli_drugs.csv')
def queryPubChem(compounds, batchlist=None, match='name'):
    """
    Query PubChem for each compound name and record its SMILES, InChIKey and CID.

    Every distinct, non-missing name in ``compounds['0']`` is looked up with
    ``pcp.get_compounds``. When several records match, the first one is
    recorded; when nothing matches, empty strings are recorded. The collected
    table is written to ``oli_result.csv`` (tab-separated) and is also
    returned, so that a long job can be run in batches to avoid overloading
    PubChem: pass the return value of one run as ``batchlist`` to the next and
    names that were already looked up are not queried again.

    Inputs:
        compounds = table (e.g. pandas DataFrame) whose column '0' holds the
            compound names to query
        batchlist (optional) = output of a previous run: a list whose first
            two items are the name -> isomeric SMILES and name -> CID
            mappings, optionally followed by the name -> canonical SMILES and
            name -> InChIKey mappings. The mappings are copied, never
            modified in place.
        match = passed through unchanged as the second argument of
            pcp.get_compounds (defaults to 'name')
    Output:
        a list of four dictionaries keyed by compound name, in the order
        [isomeric SMILES, CID, canonical SMILES, InChIKey]. The first two
        entries keep the layout expected for ``batchlist``.

    Exceptions raised by pcp.get_compounds are not caught here; in that case
    nothing is written and nothing is returned.
    """
    if batchlist is None:
        iso_smiles = OrderedDict()
        cids = OrderedDict()
        can_smiles = OrderedDict()
        inchikeys = OrderedDict()
    else:
        # Copy so that the caller's previous results are left untouched.
        iso_smiles = OrderedDict(batchlist[0])
        cids = OrderedDict(batchlist[1])
        # Older two-item batch lists carry no canonical SMILES / InChIKey.
        can_smiles = OrderedDict(batchlist[2]) if len(batchlist) > 2 else OrderedDict()
        inchikeys = OrderedDict(batchlist[3]) if len(batchlist) > 3 else OrderedDict()

    # Missing cells are not compound names; do not send them to PubChem.
    names = compounds['0'].dropna()

    multi = 0  # names for which PubChem returned more than one record
    cnt = 0    # queries actually sent in this run
    print("Retrieving compound information for " + str(names.nunique()) + " compounds...")
    for name in names:
        # Already answered by a previous batch, or a duplicate row in this one.
        if name in cids:
            continue

        hits = pcp.get_compounds(name, match)
        cnt += 1
        if len(hits) > 1:
            multi += 1
        if cnt in (100, 500, 1000, 2000, 5000, 10000):
            print("processing... " + str(cnt))

        if hits:
            first = hits[0]
            iso_smiles[name] = first.isomeric_smiles
            can_smiles[name] = first.canonical_smiles
            inchikeys[name] = first.inchikey
            cids[name] = first.cid
        else:
            # Keep a row for unmatched names so they remain visible in the output.
            iso_smiles[name] = ""
            can_smiles[name] = ""
            inchikeys[name] = ""
            cids[name] = ""

    print("##########")
    print("COMPLETE!")
    print("A total of " + str(multi) + " had multiple IDs matching. First matches were recorded.")

    # One row per compound; columns are isomeric SMILES, canonical SMILES, CID, InChIKey.
    results = pd.DataFrame([iso_smiles, can_smiles, cids, inchikeys]).transpose()
    results.to_csv('oli_result.csv', sep="\t")
    return [iso_smiles, cids, can_smiles, inchikeys]
# Script entry point: look up every compound in the module-level drugs_df table.
if __name__ == "__main__":
    queryPubChem(compounds=drugs_df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0 | ||
---|---|---|
1 | Cisplatin | |
2 | camptothecin | |
3 | Etoposide | |
4 | HU | |
5 | IR | |
6 | Doxorubicin | |
7 | H2O2 | |
8 | MMS | |
9 | Pyridostatin | |
10 | UV | |
11 | Bleomycin | |
12 | Olaparib | |
13 | AZD6738 | |
16 | ICRF | |
17 | Formaldehyde | |
18 | PhenDC3 | |
19 | Duocarmycin | |
20 | Trabectedin | |
21 | Calicheamicin | |
22 | Gemcitabine | |
23 | illudinS | |
24 | MLN4924 | |
25 | MNNG | |
26 | KBrO3 | |
27 | CD437 | |
29 | BPDE | |
30 | HU2 | |
31 | PladB |
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0 1 2 | |
Cisplatin N.N.Cl[Pt]Cl N.N.Cl[Pt]Cl 5702198 | |
camptothecin CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3=C2)O CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3=C2)O 24360 | |
Etoposide C[C@@H]1OC[C@@H]2[C@@H](O1)[C@@H]([C@H]([C@@H](O2)O[C@H]3[C@H]4COC(=O)[C@@H]4[C@@H](C5=CC6=C(C=C35)OCO6)C7=CC(=C(C(=C7)OC)O)OC)O)O CC1OCC2C(O1)C(C(C(O2)OC3C4COC(=O)C4C(C5=CC6=C(C=C35)OCO6)C7=CC(=C(C(=C7)OC)O)OC)O)O 36462 | |
HU C(=O)(N)NO C(=O)(N)NO 3657 | |
IR [Ir] [Ir] 23924 | |
Doxorubicin C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)CO)O)N)O CC1C(C(CC(O1)OC2CC(CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)CO)O)N)O 31703 | |
H2O2 OO OO 784 | |
MMS COS(=O)(=O)C COS(=O)(=O)C 4156 | |
Pyridostatin C1=CC=C2C(=C1)C(=CC(=N2)NC(=O)C3=CC(=CC(=N3)C(=O)NC4=NC5=CC=CC=C5C(=C4)OCCN)OCCN)OCCN C1=CC=C2C(=C1)C(=CC(=N2)NC(=O)C3=CC(=CC(=N3)C(=O)NC4=NC5=CC=CC=C5C(=C4)OCCN)OCCN)OCCN 25227847 | |
UV CC[C@@H](C=O)NC(=O)[C@H](CC)NC(=O)[C@H](CC)NC(=O)[C@H](CC)NC(=O)[C@H](CC)NC(=O)[C@H](CC)NC(=O)[C@H](CC)NC(=O)[C@H](CC)N CCC(C=O)NC(=O)C(CC)NC(=O)C(CC)NC(=O)C(CC)NC(=O)C(CC)NC(=O)C(CC)NC(=O)C(CC)NC(=O)C(CC)N 155487962 | |
Bleomycin CC1=C(N=C(N=C1N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)N)C(=O)N[C@@H]([C@H](C2=CN=CN2)OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(C(O4)CO)O)OC(=O)N)O)C(=O)N[C@H](C)[C@H]([C@H](C)C(=O)N[C@@H]([C@@H](C)O)C(=O)NCCC5=NC(=CS5)C6=NC(=CS6)C(=O)NCCC[S+](C)C)O CC1=C(N=C(N=C1N)C(CC(=O)N)NCC(C(=O)N)N)C(=O)NC(C(C2=CN=CN2)OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(C(O4)CO)O)OC(=O)N)O)C(=O)NC(C)C(C(C)C(=O)NC(C(C)O)C(=O)NCCC5=NC(=CS5)C6=NC(=CS6)C(=O)NCCC[S+](C)C)O 5360373 | |
Olaparib C1CC1C(=O)N2CCN(CC2)C(=O)C3=C(C=CC(=C3)CC4=NNC(=O)C5=CC=CC=C54)F C1CC1C(=O)N2CCN(CC2)C(=O)C3=C(C=CC(=C3)CC4=NNC(=O)C5=CC=CC=C54)F 23725625 | |
AZD6738 C[C@@H]1COCCN1C2=NC(=NC(=C2)C3(CC3)[S@](=N)(=O)C)C4=C5C=CNC5=NC=C4 CC1COCCN1C2=NC(=NC(=C2)C3(CC3)S(=N)(=O)C)C4=C5C=CNC5=NC=C4 54761306 | |
ICRF | |
Formaldehyde C=O C=O 712 | |
PhenDC3 C[N+]1=CC(=CC2=CC=CC=C21)NC(=O)C3=NC4=C(C=CC5=C4N=C(C=C5)C(=O)NC6=CC7=CC=CC=C7[N+](=C6)C)C=C3.C(F)(F)(F)S(=O)(=O)[O-].C(F)(F)(F)S(=O)(=O)[O-] C[N+]1=CC(=CC2=CC=CC=C21)NC(=O)C3=NC4=C(C=CC5=C4N=C(C=C5)C(=O)NC6=CC7=CC=CC=C7[N+](=C6)C)C=C3.C(F)(F)(F)S(=O)(=O)[O-].C(F)(F)(F)S(=O)(=O)[O-] 131704505 | |
Duocarmycin | |
Trabectedin CC1=CC2=C([C@@H]3[C@@H]4[C@H]5C6=C(C(=C7C(=C6[C@@H](N4[C@H]([C@H](C2)N3C)O)COC(=O)[C@@]8(CS5)C9=CC(=C(C=C9CCN8)O)OC)OCO7)C)OC(=O)C)C(=C1OC)O CC1=CC2=C(C3C4C5C6=C(C(=C7C(=C6C(N4C(C(C2)N3C)O)COC(=O)C8(CS5)C9=CC(=C(C=C9CCN8)O)OC)OCO7)C)OC(=O)C)C(=C1OC)O 108150 | |
Calicheamicin CCN[C@H]1CO[C@H](C[C@@H]1OC)O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2O[C@H]3C#C/C=C\C#C[C@@]\4(CC(=O)C(=C3/C4=C/CSSSC)NC(=O)OC)O)C)NO[C@H]5C[C@@H]([C@@H]([C@H](O5)C)SC(=O)C6=C(C(=C(C(=C6OC)OC)O[C@H]7[C@@H]([C@@H]([C@H]([C@@H](O7)C)O)OC)O)I)C)O)O CCNC1COC(CC1OC)OC2C(C(C(OC2OC3C#CC=CC#CC4(CC(=O)C(=C3C4=CCSSSC)NC(=O)OC)O)C)NOC5CC(C(C(O5)C)SC(=O)C6=C(C(=C(C(=C6OC)OC)OC7C(C(C(C(O7)C)O)OC)O)I)C)O)O 101617325 | |
Gemcitabine C1=CN(C(=O)N=C1N)[C@H]2C([C@@H]([C@H](O2)CO)O)(F)F C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F 60750 | |
illudinS | |
MLN4924 C1CC2=CC=CC=C2[C@H]1NC3=C4C=CN(C4=NC=N3)[C@@H]5C[C@H]([C@H](C5)O)COS(=O)(=O)N C1CC2=CC=CC=C2C1NC3=C4C=CN(C4=NC=N3)C5CC(C(C5)O)COS(=O)(=O)N 16720766 | |
MNNG CN(C(=N)N[N+](=O)[O-])N=O CN(C(=N)N[N+](=O)[O-])N=O 135436526 | |
KBrO3 | |
CD437 C1C2CC3CC1CC(C2)(C3)C4=C(C=CC(=C4)C5=CC6=C(C=C5)C=C(C=C6)C(=O)O)O C1C2CC3CC1CC(C2)(C3)C4=C(C=CC(=C4)C5=CC6=C(C=C5)C=C(C=C6)C(=O)O)O 135411 | |
BPDE C1=CC2=C3C(=C1)C=CC4=C3C(=CC5=C4C6C(O6)C(C5O)O)C=C2 C1=CC2=C3C(=C1)C=CC4=C3C(=CC5=C4C6C(O6)C(C5O)O)C=C2 41322 | |
HU2 CCC[C@@H](C(C(=O)NCC(=O)N[C@@H](C1=CC=CC=C1)C(=O)O)(O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](CCCCC(C)C)C(C)C CCCC(C(C(=O)NCC(=O)NC(C1=CC=CC=C1)C(=O)O)(O)O)NC(=O)C(CC(C)C)NC(=O)C(CCCCC(C)C)C(C)C 16741223 | |
PladB |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment