|
|
|
""" |
|
Created on Wed Jan 12 16:21:43 2022 |
|
|
|
@author: jihon |
|
""" |
|
|
|
|
|
import json |
|
import requests |
|
import numpy as np |
|
import pandas as pd |
|
from bs4 import BeautifulSoup |
|
from rdkit import Chem |
|
from rdkit.Chem import AllChem |
|
|
|
|
|
def retrieve_by_cid_list(idlist):
    """Fetch formula, SMILES and InChIKey for PubChem CIDs via PUG REST.

    The identifiers are requested in batches of 100 per HTTP call (15 s
    timeout each). Records whose SMILES contains '+', '-' or '.' are
    discarded before the table is returned.

    Parameters
    ----------
    idlist : iterable
        PubChem compound identifiers; each one is converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CID``, ``MolecularFormula``, ``CanonicalSMILES`` and
        ``InChIKey`` with a fresh 0..n-1 index. The frame is empty (but
        still carries these columns) when ``idlist`` is empty.

    Raises
    ------
    KeyError
        If a response does not contain ``PropertyTable`` / ``Properties``.
    ValueError
        If a response body is not valid JSON.
    """
    columns = ['CID', 'MolecularFormula', 'CanonicalSMILES', 'InChIKey']
    batch_size = 100
    ids = [str(cid) for cid in idlist]

    rows = []
    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]
        url = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" + ",".join(batch) + "/property/MolecularFormula,InChIKey,CanonicalSMILES/JSON"
        # The endpoint returns JSON, so parse the raw bytes directly.
        payload = json.loads(requests.get(url, timeout=15).content)
        for record in payload['PropertyTable']['Properties']:
            # Pick every field by name so a reordered or missing key in the
            # response cannot shift values into the wrong column.
            # NOTE(review): PubChem is believed to return this property under
            # the key 'ConnectivitySMILES' in newer responses - accept either
            # key; confirm against the current PUG REST documentation.
            smiles = record.get('CanonicalSMILES', record.get('ConnectivitySMILES'))
            rows.append([
                record.get('CID'),
                record.get('MolecularFormula'),
                smiles,
                record.get('InChIKey'),
            ])

    res = pd.DataFrame(rows, columns=columns)
    if res.empty:
        # Nothing requested / nothing returned: hand back the empty table
        # with the expected columns instead of failing on relabeling.
        return res

    # Keep only SMILES strings free of '+', '-' and '.'; records without a
    # SMILES string are dropped as well.
    keep = np.array(
        [
            isinstance(s, str) and not any(ch in s for ch in '+-.')
            for s in res['CanonicalSMILES']
        ],
        dtype=bool,
    )
    return res.loc[keep, :].reset_index(drop=True)
|
|
|
|
|
def refine_compound_list(res):
    """Drop unusable and duplicate candidates from a compound table.

    A row is kept only if all of the following hold:

    * its SMILES is a string without a '.' character,
    * RDKit can parse the SMILES,
    * the formula RDKit computes equals the row's ``MolecularFormula``,
    * the first block of its InChIKey (text before the first '-') has not
      been seen in an earlier kept row.

    Parameters
    ----------
    res : pandas.DataFrame
        Must provide the columns ``CanonicalSMILES``, ``MolecularFormula``
        and ``InChIKey``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``res`` in their original order, with their
        original index labels (the index is not reset here).
    """
    if len(res.index) == 0:
        return res.iloc[[]]

    kept_positions = []
    seen_blocks = set()  # set membership keeps duplicate detection O(1)
    rows = zip(res['CanonicalSMILES'], res['MolecularFormula'], res['InChIKey'])
    for position, (smi, formula, inchikey) in enumerate(rows):
        # Cheap rejections first: missing SMILES (e.g. NaN) and
        # SMILES containing '.' never reach the RDKit parser.
        if not isinstance(smi, str) or '.' in smi:
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        if formula != AllChem.CalcMolFormula(mol):
            continue
        first_block = inchikey.split('-')[0]
        if first_block in seen_blocks:
            continue
        seen_blocks.add(first_block)
        kept_positions.append(position)

    # Positional selection stays correct even if index labels repeat.
    return res.iloc[kept_positions]
|
|
|
|
|
def retrieve_by_exact_mass(mass, ppm = 10, timeout = 999):
    """Search PubChem for compounds whose exact mass lies within a ppm window.

    The window is ``mass +/- mass * ppm / 1e6``. Matching CIDs (at most 9999,
    per the ``retmax`` in the query) are obtained from the NCBI E-utilities
    ``esearch`` endpoint, their properties are fetched with
    ``retrieve_by_cid_list`` and the table is cleaned with
    ``refine_compound_list``.

    Parameters
    ----------
    mass : float
        Target exact mass.
    ppm : float, optional
        Half-width of the mass window in parts per million.
    timeout : float, optional
        Seconds to wait for the search request before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``CID``, ``MolecularFormula``, ``CanonicalSMILES`` and
        ``InChIKey`` with a 0..n-1 index; empty when the search has no hits.

    Raises
    ------
    ConnectionError
        If the search request fails; the underlying ``requests`` error is
        attached as ``__cause__``.
    """
    tolerance = mass * ppm / 10 ** 6
    min_mass = mass - tolerance
    max_mass = mass + tolerance
    url = '''https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pccompound&retmode=json&term={}%3A{}[ExactMass]&retmax=9999'''.format(min_mass, max_mass)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Only network-level failures are translated; KeyboardInterrupt and
        # programming errors propagate unchanged.
        raise ConnectionError('PubChem exact-mass search request failed') from exc

    idlist = json.loads(response.text)['esearchresult']['idlist']
    if not idlist:
        # No hits: return an empty table of the usual shape without
        # issuing a property request.
        return pd.DataFrame(columns=['CID', 'MolecularFormula', 'CanonicalSMILES', 'InChIKey'])

    result = retrieve_by_cid_list(idlist)
    result = refine_compound_list(result)
    return result.reset_index(drop = True)
|
|
|
|
|
def retrieve_by_formula(formula, timeout=999):
    """Look up PubChem compounds that have the given molecular formula.

    CIDs come from the PUG REST ``fastformula`` endpoint; their properties
    are fetched with ``retrieve_by_cid_list`` and the table is cleaned with
    ``refine_compound_list``.

    Parameters
    ----------
    formula : str
        Molecular formula inserted into the request URL as-is.
    timeout : float, optional
        Seconds to wait for the formula request before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``CID``, ``MolecularFormula``, ``CanonicalSMILES`` and
        ``InChIKey`` with a 0..n-1 index.

    Raises
    ------
    ConnectionError
        If the formula request fails; the underlying ``requests`` error is
        attached as ``__cause__``.
    KeyError
        If the response has no ``IdentifierList`` / ``CID`` entry.
    """
    url = '''https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastformula/{}/cids/json'''.format(formula)
    try:
        # Honor the caller-supplied timeout (it used to be ignored).
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Only network-level failures are translated; KeyboardInterrupt and
        # programming errors propagate unchanged.
        raise ConnectionError('PubChem formula search request failed') from exc

    idlist = json.loads(response.text)['IdentifierList']['CID']
    if not idlist:
        # No CIDs: return an empty table of the usual shape without
        # issuing a property request.
        return pd.DataFrame(columns=['CID', 'MolecularFormula', 'CanonicalSMILES', 'InChIKey'])

    result = retrieve_by_cid_list(idlist)
    result = refine_compound_list(result)
    return result.reset_index(drop = True)
|
|
|
|
|
|
|
def retrieve_by_exact_mass_database(mass, database, ppm = 10):
    """Select compounds from a local table by exact mass within a ppm window.

    Rows of ``database`` whose ``'Exact mass'`` lies in
    ``[mass - mass*ppm/1e6, mass + mass*ppm/1e6]`` (bounds included) are
    taken, reduced to the title/formula/SMILES/InChIKey columns, relabeled
    and passed through ``refine_compound_list``.

    Parameters
    ----------
    mass : float
        Target exact mass.
    database : pandas.DataFrame
        Must provide the columns ``'Exact mass'``, ``'Title'``,
        ``'Formula'``, ``'SMILES'`` and ``'InChIkey'``.
    ppm : float, optional
        Half-width of the mass window in parts per million.

    Returns
    -------
    pandas.DataFrame or list
        A frame with columns ``Title``, ``MolecularFormula``,
        ``CanonicalSMILES`` and ``InChIKey`` and a 0..n-1 index, or an
        empty list when no row falls inside the mass window.
    """
    lower = mass - mass * ppm / 10 ** 6
    upper = mass + mass * ppm / 10 ** 6

    exact_mass = database['Exact mass']
    in_window = (exact_mass >= lower) & (exact_mass <= upper)
    hits = database.loc[in_window, ['Title', 'Formula', 'SMILES', 'InChIkey']]
    if not len(hits):
        return []

    # Relabel to the column names refine_compound_list reads.
    hits = hits.rename(columns={
        'Formula': 'MolecularFormula',
        'SMILES': 'CanonicalSMILES',
        'InChIkey': 'InChIKey',
    })
    return refine_compound_list(hits).reset_index(drop=True)
|
|
|
|
|
def retrieve_by_formula_database(formula, database):
    """Select compounds from a local table by exact formula match.

    Rows of ``database`` whose ``'Formula'`` equals ``formula`` are taken,
    reduced to the title/formula/SMILES/InChIKey columns, relabeled and
    passed through ``refine_compound_list``.

    Parameters
    ----------
    formula : str
        Molecular formula compared with ``==`` against the ``'Formula'``
        column.
    database : pandas.DataFrame
        Must provide the columns ``'Title'``, ``'Formula'``, ``'SMILES'``
        and ``'InChIkey'``.

    Returns
    -------
    pandas.DataFrame or list
        A frame with columns ``Title``, ``MolecularFormula``,
        ``CanonicalSMILES`` and ``InChIKey`` and a 0..n-1 index, or an
        empty list when no row has the requested formula.
    """
    same_formula = database['Formula'] == formula
    hits = database.loc[same_formula, ['Title', 'Formula', 'SMILES', 'InChIkey']]
    if not len(hits):
        return []

    # Relabel to the column names refine_compound_list reads.
    hits = hits.rename(columns={
        'Formula': 'MolecularFormula',
        'SMILES': 'CanonicalSMILES',
        'InChIkey': 'InChIKey',
    })
    return refine_compound_list(hits).reset_index(drop=True)
|
|
|
|
|
|
|
if __name__ == '__main__':

    # Intentionally empty: running this file as a script performs no work.
    pass