Spaces:
Runtime error
Runtime error
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import re | |
from PIL import Image | |
import webbrowser | |
import json | |
import pickle | |
import sys | |
import joblib | |
import sys | |
import os | |
try: | |
from openbabel import openbabel | |
except: | |
print ("openbabel not found, Installing openbabel ") | |
os.system("conda install -c openbabel openbabel") | |
from openbabel import openbabel | |
sys.path.append('./CC/') | |
import chemaxon | |
from chemaxon import * | |
from compound import Compound | |
from compound_cacher import CompoundCacher | |
from rdkit.Chem import rdChemReactions as Reactions | |
from rdkit.Chem import Draw | |
from rdkit import Chem | |
def load_smiles(): | |
db = pd.read_csv('./data/cache_compounds_20160818.csv', | |
index_col='compound_id') | |
db_smiles = db['smiles_pH7'].to_dict() | |
return db_smiles | |
def load_molsig_rad1(): | |
molecular_signature_r1 = json.load(open('./data/decompose_vector_ac.json')) | |
return molecular_signature_r1 | |
def load_molsig_rad2(): | |
molecular_signature_r2 = json.load( | |
open('./data/decompose_vector_ac_r2_py3_indent_modified_manual.json')) | |
return molecular_signature_r2 | |
def load_model(): | |
filename = './model/M12_model_BR.pkl' | |
loaded_model = joblib.load(open(filename, 'rb')) | |
return loaded_model | |
def load_compound_cache(): | |
ccache = CompoundCacher() | |
return ccache | |
def count_substructures(radius, molecule): | |
"""Helper function for get the information of molecular signature of a | |
metabolite. The relaxed signature requires the number of each substructure | |
to construct a matrix for each molecule. | |
Parameters | |
---------- | |
radius : int | |
the radius is bond-distance that defines how many neighbor atoms should | |
be considered in a reaction center. | |
molecule : Molecule | |
a molecule object create by RDkit (e.g. Chem.MolFromInchi(inchi_code) | |
or Chem.MolToSmiles(smiles_code)) | |
Returns | |
------- | |
dict | |
dictionary of molecular signature for a molecule, | |
{smiles: molecular_signature} | |
""" | |
m = molecule | |
smi_count = dict() | |
atomList = [atom for atom in m.GetAtoms()] | |
for i in range(len(atomList)): | |
env = Chem.FindAtomEnvironmentOfRadiusN(m, radius, i) | |
atoms = set() | |
for bidx in env: | |
atoms.add(m.GetBondWithIdx(bidx).GetBeginAtomIdx()) | |
atoms.add(m.GetBondWithIdx(bidx).GetEndAtomIdx()) | |
# only one atom is in this environment, such as O in H2O | |
if len(atoms) == 0: | |
atoms = {i} | |
smi = Chem.MolFragmentToSmiles(m, atomsToUse=list(atoms), | |
bondsToUse=env, canonical=True) | |
if smi in smi_count: | |
smi_count[smi] = smi_count[smi] + 1 | |
else: | |
smi_count[smi] = 1 | |
return smi_count | |
def decompse_novel_mets_rad1(novel_smiles, radius=1): | |
decompose_vector = dict() | |
for cid, smiles_pH7 in novel_smiles.items(): | |
mol = Chem.MolFromSmiles(smiles_pH7) | |
mol = Chem.RemoveHs(mol) | |
# Chem.RemoveStereochemistry(mol) | |
smi_count = count_substructures(radius, mol) | |
decompose_vector[cid] = smi_count | |
return decompose_vector | |
def decompse_novel_mets_rad2(novel_smiles, radius=2): | |
decompose_vector = dict() | |
for cid, smiles_pH7 in novel_smiles.items(): | |
mol = Chem.MolFromSmiles(smiles_pH7) | |
mol = Chem.RemoveHs(mol) | |
# Chem.RemoveStereochemistry(mol) | |
smi_count = count_substructures(radius, mol) | |
decompose_vector[cid] = smi_count | |
return decompose_vector | |
# def parse_rule(rxn,df_rule): | |
# df = df_rule | |
# rule_df = df[rxn].to_frame() | |
# # new_df = rule_df[(rule_df.T != 0).any()] | |
# return rule_df[(rule_df.T != 0).any()] | |
def parse_reaction_formula_side(s): | |
""" | |
Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003' | |
Ignores stoichiometry. | |
Returns: | |
The set of CIDs. | |
""" | |
if s.strip() == "null": | |
return {} | |
compound_bag = {} | |
for member in re.split('\s+\+\s+', s): | |
tokens = member.split(None, 1) | |
if len(tokens) == 0: | |
continue | |
if len(tokens) == 1: | |
amount = 1 | |
key = member | |
else: | |
amount = float(tokens[0]) | |
key = tokens[1] | |
compound_bag[key] = compound_bag.get(key, 0) + amount | |
return compound_bag | |
def parse_formula(formula, arrow='<=>', rid=None): | |
""" | |
Parses a two-sided formula such as: 2 C00001 => C00002 + C00003 | |
Return: | |
The set of substrates, products and the direction of the reaction | |
""" | |
tokens = formula.split(arrow) | |
if len(tokens) < 2: | |
print(('Reaction does not contain the arrow sign (%s): %s' | |
% (arrow, formula))) | |
if len(tokens) > 2: | |
print(('Reaction contains more than one arrow sign (%s): %s' | |
% (arrow, formula))) | |
left = tokens[0].strip() | |
right = tokens[1].strip() | |
sparse_reaction = {} | |
for cid, count in parse_reaction_formula_side(left).items(): | |
sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count | |
for cid, count in parse_reaction_formula_side(right).items(): | |
sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count | |
return sparse_reaction | |
def draw_rxn_figure(rxn_dict, db_smiles, novel_smiles): | |
# db_smiles = load_smiles() | |
left = '' | |
right = '' | |
for met, stoic in rxn_dict.items(): | |
if met == "C00080" or met == "C00282": | |
continue # hydogen is not considered | |
if stoic > 0: | |
if met in db_smiles: | |
right = right + db_smiles[met] + '.' | |
else: | |
right = right + novel_smiles[met] + '.' | |
else: | |
if met in db_smiles: | |
left = left + db_smiles[met] + '.' | |
else: | |
left = left + novel_smiles[met] + '.' | |
smarts = left[:-1] + '>>' + right[:-1] | |
# print smarts | |
smarts = str(smarts) | |
rxn = Reactions.ReactionFromSmarts(smarts, useSmiles=True) | |
return Draw.ReactionToImage(rxn) # , subImgSize=(400, 400)) | |
# def draw_group_changes(rxn,df_rule): | |
# df = parse_rule(rxn,df_rule) | |
# group_dict = df.to_dict()[rxn] | |
# left = '' | |
# right = '' | |
# for smiles,stoic in group_dict.iteritems(): | |
# if stoic > 0: | |
# right = right + smiles + '.' | |
# else: | |
# left = left + smiles + '.' | |
# smarts = left[:-1] + '>>' + right[:-1] | |
# rxn = Reactions.ReactionFromSmarts(smarts, useSmiles=True) | |
# return Draw.ReactionToImage(rxn) | |
# def get_rxn_rule(rid): | |
# reaction_dict = json.load(open('../data/optstoic_v3_Sji_dict.json')) | |
# molecular_signature = json.load(open('../data/decompose_vector_ac.json')) | |
# molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0) | |
# all_mets = molsigna_df.columns.tolist() | |
# all_mets.append("C00080") | |
# all_mets.append("C00282") | |
# rule_df = pd.DataFrame(index=molsigna_df.index) | |
# info = reaction_dict[rid] | |
# # skip the reactions with missing metabolites | |
# mets = info.keys() | |
# flag = False | |
# for met in mets: | |
# if met not in all_mets: | |
# flag = True | |
# break | |
# if flag: | |
# return None | |
# rule_df[rid] = 0 | |
# for met, stoic in info.items(): | |
# if met == "C00080" or met == "C00282": | |
# continue # hydogen is zero | |
# rule_df[rid] += molsigna_df[met] * stoic | |
# return rule_df | |
def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2): | |
if novel_decomposed1 != None: | |
for cid in novel_decomposed1: | |
molsig1[cid] = novel_decomposed1[cid] | |
if novel_decomposed2 != None: | |
for cid in novel_decomposed2: | |
molsig2[cid] = novel_decomposed2[cid] | |
molsigna_df1 = pd.DataFrame.from_dict(molsig1).fillna(0) | |
all_mets1 = molsigna_df1.columns.tolist() | |
all_mets1.append("C00080") | |
all_mets1.append("C00282") | |
molsigna_df2 = pd.DataFrame.from_dict(molsig2).fillna(0) | |
all_mets2 = molsigna_df2.columns.tolist() | |
all_mets2.append("C00080") | |
all_mets2.append("C00282") | |
moieties_r1 = open('./data/group_names_r1.txt') | |
moieties_r2 = open('./data/group_names_r2_py3_modified_manual.txt') | |
moie_r1 = moieties_r1.read().splitlines() | |
moie_r2 = moieties_r2.read().splitlines() | |
molsigna_df1 = molsigna_df1.reindex(moie_r1) | |
molsigna_df2 = molsigna_df2.reindex(moie_r2) | |
rule_df1 = pd.DataFrame(index=molsigna_df1.index) | |
rule_df2 = pd.DataFrame(index=molsigna_df2.index) | |
# for rid, value in reaction_dict.items(): | |
# # skip the reactions with missing metabolites | |
# mets = value.keys() | |
# flag = False | |
# for met in mets: | |
# if met not in all_mets: | |
# flag = True | |
# break | |
# if flag: continue | |
rule_df1['change'] = 0 | |
for met, stoic in rxn_dict.items(): | |
if met == "C00080" or met == "C00282": | |
continue # hydogen is zero | |
rule_df1['change'] += molsigna_df1[met] * stoic | |
rule_df2['change'] = 0 | |
for met, stoic in rxn_dict.items(): | |
if met == "C00080" or met == "C00282": | |
continue # hydogen is zero | |
rule_df2['change'] += molsigna_df2[met] * stoic | |
rule_vec1 = rule_df1.to_numpy().T | |
rule_vec2 = rule_df2.to_numpy().T | |
m1, n1 = rule_vec1.shape | |
m2, n2 = rule_vec2.shape | |
zeros1 = np.zeros((m1, 44)) | |
zeros2 = np.zeros((m2, 44)) | |
X1 = np.concatenate((rule_vec1, zeros1), 1) | |
X2 = np.concatenate((rule_vec2, zeros2), 1) | |
rule_comb = np.concatenate((X1, X2), 1) | |
# rule_df_final = {} | |
# rule_df_final['rad1'] = rule_df1 | |
# rule_df_final['rad2'] = rule_df2 | |
return rule_comb, rule_df1, rule_df2 | |
def get_ddG0(rxn_dict, pH, I, novel_mets): | |
ccache = CompoundCacher() | |
# ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T) | |
T = 298.15 | |
ddG0_forward = 0 | |
for compound_id, coeff in rxn_dict.items(): | |
if novel_mets != None and compound_id in novel_mets: | |
comp = novel_mets[compound_id] | |
else: | |
comp = ccache.get_compound(compound_id) | |
ddG0_forward += coeff * comp.transform_pH7(pH, I, T) | |
return ddG0_forward | |
def get_dG0(rxn_dict, rid, pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets): | |
# rule_df = get_rxn_rule(rid) | |
rule_comb, rule_df1, rule_df2 = get_rule( | |
rxn_dict, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2) | |
X = rule_comb | |
# X = X.reshape(1,-1) | |
# pdb.set_trace() | |
# print(np.shape(X1)) | |
# print(np.shape(X2)) | |
# print(np.shape(X)) | |
ymean, ystd = loaded_model.predict(X, return_std=True) | |
# print(ymean) | |
# print(ystd) | |
result = {} | |
# result['dG0'] = ymean[0] + get_ddG0(rxn_dict, pH, I) | |
# result['standard deviation'] = ystd[0] | |
# result_df = pd.DataFrame([result]) | |
# result_df.style.hide_index() | |
# return result_df | |
return ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets), ystd[0], rule_df1, rule_df2 | |
# return ymean[0],ystd[0] | |
def parse_novel_molecule(add_info): | |
result = {} | |
for cid, InChI in add_info.items(): | |
c = Compound.from_inchi('Test', cid, InChI) | |
result[cid] = c | |
return result | |
def parse_novel_smiles(result): | |
novel_smiles = {} | |
for cid, c in result.items(): | |
smiles = c.smiles_pH7 | |
novel_smiles[cid] = smiles | |
return novel_smiles | |
def main(): | |
# def img_to_bytes(img_path): | |
# img_bytes = Path(img_path).read_bytes() | |
# encoded = base64.b64encode(img_bytes).decode() | |
# return encoded | |
# # st.title('dGPredictor') | |
# header_html = "<img src='../figures/header.png'>" | |
# st.markdown( | |
# header_html, unsafe_allow_html=True, | |
# ) | |
db_smiles = load_smiles() | |
molsig_r1 = load_molsig_rad1() | |
molsig_r2 = load_molsig_rad2() | |
loaded_model = load_model() | |
ccache = load_compound_cache() | |
st.image('./figures/header.png', use_column_width=True) | |
st.subheader('Reaction (please use KEGG IDs)') | |
# rxn_str = st.text_input('Reaction using KEGG ids:', value='C16688 + C00001 <=> C00095 + C00092') | |
rxn_str = st.text_input( | |
'', value='C01745 + C00004 <=> N00001 + C00003 + C00001') | |
# rxn_str = st.text_input('', value='C16688 + C00001 <=> C00095 + C00092') | |
# url = 'https://www.genome.jp/dbget-bin/www_bget?rn:R00801' | |
# if st.button('KEGG format example'): | |
# webbrowser.open_new_tab(url) | |
if st.checkbox('Reaction has metabolites not in KEGG'): | |
# st.subheader('test') | |
add_info = st.text_area('Additional information (id: InChI):', | |
'{"N00001":"InChI=1S/C14H12O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-11,15H/b10-9+"}') | |
else: | |
add_info = '{"None":"None"}' | |
# session_state = SessionState.get(name="", button_sent=False) | |
# button_search = st.button("Search") | |
# if button_search: | |
# session_state.button_search = True | |
pH = st.slider('pH', min_value=0.0, max_value=14.0, value=7.0, step=0.1) | |
I = st.slider('Ionic strength [M]', min_value=0.0, | |
max_value=0.5, value=0.1, step=0.01) | |
if st.button("Search"): | |
# if session_state.button_search: | |
st.subheader('Reaction Equation') | |
st.write(rxn_str) | |
with st.spinner('Searching...'): | |
try: | |
novel_mets = parse_novel_molecule(json.loads(add_info)) | |
novel_smiles = parse_novel_smiles(novel_mets) | |
novel_decomposed_r1 = decompse_novel_mets_rad1(novel_smiles) | |
novel_decomposed_r2 = decompse_novel_mets_rad2(novel_smiles) | |
except Exception as e: | |
novel_mets = None | |
novel_smiles = None | |
novel_decomposed_r1 = None | |
novel_decomposed_r2 = None | |
# novel_smiles = json.loads(add_info) | |
print(novel_smiles) | |
rxn_dict = parse_formula(rxn_str) | |
st.image(draw_rxn_figure(rxn_dict, db_smiles, | |
novel_smiles), use_column_width=True) | |
# st.text('Group changes:') | |
# st.write(parse_rule('R03921')) | |
# st.write(get_rxn_rule('R03921')) | |
# session_state.calculate = st.button('Start Calculate!') | |
# if session_state.calculate: | |
# if st.button('Start Calculate!'): | |
# st.text('Result:') | |
st.subheader('Thermodynamics') | |
with st.spinner('Calculating...'): | |
mu, std, rule_df1, rule_df2 = get_dG0( | |
rxn_dict, 'R00801', pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets) | |
st.write(r"$\Delta_r G'^{o} = %.2f \pm %.2f \ kJ/mol$" % (mu, std)) | |
st.text('Group changes:') | |
st.write(rule_df1[(rule_df1.T != 0).any()]) | |
st.write(rule_df2[(rule_df2.T != 0).any()]) | |
if __name__ == '__main__': | |
main() | |