vuu10 committed on
Commit
5a085f6
1 Parent(s): 85e9a63

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ Streamlit/streamlit-main-2022-10-10-17-10-73.webm filter=lfs diff=lfs merge=lfs -text
Streamlit/Untitled.ipynb ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "ename": "ModuleNotFoundError",
10
+ "evalue": "No module named 'rdkit'",
11
+ "output_type": "error",
12
+ "traceback": [
13
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
14
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
15
+ "Cell \u001b[0;32mIn [1], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mrdkit\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Chem\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mrdkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mChem\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AllChem\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# from rdkit.Chem import Draw\u001b[39;00m\n",
16
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'rdkit'"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import numpy as np\n",
23
+ "\n",
24
+ "from rdkit import Chem\n",
25
+ "from rdkit.Chem import AllChem\n",
26
+ "# from rdkit.Chem import Draw\n",
27
+ "from rdkit.Chem import rdChemReactions as Reactions\n",
28
+ "\n",
29
+ "import tensorflow as tf\n",
30
+ "from tensorflow import keras\n",
31
+ "from keras.preprocessing import sequence\n",
32
+ "from keras.utils import pad_sequences\n",
33
+ "import keras\n",
34
+ "from keras import backend as K\n",
35
+ "from keras.models import load_model\n",
36
+ "import argparse\n",
37
+ "import h5py\n",
38
+ "import pdb\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "\n",
48
+ "seq_rdic = ['A', 'I', 'L', 'V', 'F', 'W', 'Y', 'N', 'C', 'Q', 'M','S', 'T', 'D', 'E', 'R', 'H', 'K', 'G', 'P', 'O', 'U', 'X', 'B', 'Z']\n",
49
+ "seq_dic = {w: i+1 for i, w in enumerate(seq_rdic)}\n",
50
+ "\n",
51
+ "\n",
52
+ "def encodeSeq(seq, seq_dic):\n",
53
+ " if pd.isnull(seq):\n",
54
+ " return [0]\n",
55
+ " else:\n",
56
+ " return [seq_dic[aa] for aa in seq]\n",
57
+ "\n",
58
+ "\n",
59
+ "def load_modelfile(model_string):\n",
60
+ "\tloaded_model = tf.keras.models.load_model(model_string)\n",
61
+ "\treturn loaded_model\n"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 4,
67
+ "metadata": {},
68
+ "outputs": [
69
+ {
70
+ "ename": "NameError",
71
+ "evalue": "name 'load_modelfile' is not defined",
72
+ "output_type": "error",
73
+ "traceback": [
74
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
75
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
76
+ "Cell \u001b[0;32mIn [4], line 80\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m prediction_vals[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# loaded_model = load_modelfile('./../CNN_results/model_final.model')\u001b[39;00m\n\u001b[1;32m 76\u001b[0m \n\u001b[1;32m 77\u001b[0m \u001b[38;5;66;03m# KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')\u001b[39;00m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;66;03m# kegg_df = KEGG_compound_read.reset_index()\u001b[39;00m\n\u001b[0;32m---> 80\u001b[0m loaded_model \u001b[38;5;241m=\u001b[39m load_modelfile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./../CNN_results_split_final/Final_model.model\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 81\u001b[0m KEGG_compound_read \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./../CNN_data/Final_test/kegg_compound.csv\u001b[39m\u001b[38;5;124m'\u001b[39m, index_col \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCompound_ID\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 82\u001b[0m kegg_df \u001b[38;5;241m=\u001b[39m KEGG_compound_read\u001b[38;5;241m.\u001b[39mreset_index()\n",
77
+ "\u001b[0;31mNameError\u001b[0m: name 'load_modelfile' is not defined"
78
+ ]
79
+ }
80
+ ],
81
+ "source": [
82
+ "\n",
83
+ "def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):\n",
84
+ " Prot_ID = prot_input_str.split(':')[0]\n",
85
+ " Prot_seq = prot_input_str.split(':')[1]\n",
86
+ " prot_dataframe = pd.DataFrame(\n",
87
+ " {'Protein_ID': Prot_ID, 'Sequence': Prot_seq}, index=[0])\n",
88
+ " prot_dataframe.set_index('Protein_ID')\n",
89
+ "\n",
90
+ " prot_dataframe[\"encoded_sequence\"] = prot_dataframe.Sequence.map(\n",
91
+ " lambda a: encodeSeq(a, seq_dic))\n",
92
+ " prot_feature = pad_sequences(\n",
93
+ " prot_dataframe[\"encoded_sequence\"].values, prot_len)\n",
94
+ "\n",
95
+ " return prot_feature, Prot_ID\n",
96
+ "\n",
97
+ "\n",
98
+ "def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):\n",
99
+ "\n",
100
+ "\tif kegg_id_flag == 1:\n",
101
+ "\t\tKEGG_ID = mol_str\n",
102
+ "\t\tkegg_id_loc = kegg_df.index[kegg_df.Compound_ID == KEGG_ID][0]\n",
103
+ "\t\tKEGG_ID_info = kegg_df.loc[kegg_id_loc]\n",
104
+ "\t\tKEGG_ID_info_df = KEGG_ID_info.to_frame().T.set_index('Compound_ID')\n",
105
+ "\n",
106
+ "\t\tfinal_return = KEGG_ID_info_df\n",
107
+ "\t\tfinal_id = KEGG_ID\n",
108
+ "\n",
109
+ "\telse:\n",
110
+ "\t\ttry:\n",
111
+ "\t\t\tmol_ID = mol_str.split(':')[0]\n",
112
+ "\t\t\tmol_smiles = mol_str.split(':')[1]\n",
113
+ "\t\t\tmol = Chem.MolFromSmiles(mol_smiles)\n",
114
+ "\t\t\tfp1 = AllChem.GetMorganFingerprintAsBitVect(\n",
115
+ "\t\t\t mol, useChirality=True, radius=2, nBits=2048)\n",
116
+ "\t\t\tfp_list = list(np.array(fp1).astype(float))\n",
117
+ "\t\t\tfp_str = list(map(str, fp_list))\n",
118
+ "\t\t\tmol_fp = '\\t'.join(fp_str)\n",
119
+ "\n",
120
+ "\t\t\tmol_dict = {}\n",
121
+ "\t\t\tmol_dict['Compound_ID'] = mol_ID\n",
122
+ "\t\t\tmol_dict['Smiles'] = mol_smiles\n",
123
+ "\t\t\tmol_dict['morgan_fp_r2'] = mol_fp\n",
124
+ "\n",
125
+ "\t\t\tmol_info_df = pd.DataFrame(mol_dict, index=[0])\n",
126
+ "\t\t\tmol_info_df.set_index('Compound_ID')\n",
127
+ "\n",
128
+ "\t\t\tfinal_return = mol_info_df\n",
129
+ "\t\t\tfinal_id = mol_ID\n",
130
+ "\n",
131
+ "\t\texcept Exception as error:\n",
132
+ "\t\t\tprint('Something wrong with molecule input string...' + repr(error))\n",
133
+ "\n",
134
+ "\treturn final_return, final_id\n",
135
+ "\n",
136
+ "\n",
137
+ "def act_df_gen_mol_feature(mol_id, prot_id):\n",
138
+ "\tact_df = pd.DataFrame(\n",
139
+ "\t {'Protein_ID': prot_id, 'Compound_ID': mol_id}, index=[0])\n",
140
+ "\n",
141
+ "\treturn act_df\n",
142
+ "\n",
143
+ "\n",
144
+ "def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):\n",
145
+ "\tact_df = pd.merge(act_df, comp_df, left_on='Compound_ID', right_index=True)\n",
146
+ "\tcomp_feature = np.stack(act_df[comp_vec].map(lambda fp: fp.split(\"\\t\")))\n",
147
+ "\tcomp_feature = comp_feature.astype('float')\n",
148
+ "\treturn comp_feature\n",
149
+ "\n",
150
+ "\n",
151
+ "def model_prediction(compound_feature, enz_feature, model):\n",
152
+ " prediction_vals = model.predict([compound_feature, enz_feature])\n",
153
+ "\n",
154
+ " return prediction_vals[0][0]\n",
155
+ "\n",
156
+ "\n",
157
+ "# loaded_model = load_modelfile('./../CNN_results/model_final.model')\n",
158
+ "\n",
159
+ "# KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')\n",
160
+ "# kegg_df = KEGG_compound_read.reset_index()\n",
161
+ "\n",
162
+ "loaded_model = load_modelfile('./../CNN_results_split_final/Final_model.model')\n",
163
+ "KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')\n",
164
+ "kegg_df = KEGG_compound_read.reset_index()\n",
165
+ "\n",
166
+ "\n",
167
+ "# def img_to_bytes(img_path):\n",
168
+ "# img_bytes = Path(img_path).read_bytes()\n",
169
+ "# encoded = base64.b64encode(img_bytes).decode()\n",
170
+ "# return encoded\n",
171
+ "# # st.title('dGPredictor')\n",
172
+ "\n",
173
+ "# header_html = \"<img src='../figures/header.png'>\"\n",
174
+ "\n",
175
+ "# st.markdown(\n",
176
+ "# header_html, unsafe_allow_html=True,\n",
177
+ "# )\n",
178
+ "\n"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": 3,
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "name": "stdout",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "Error somewhere...NameError(\"name 'prot_feature_gen_from_str_input' is not defined\")\n"
191
+ ]
192
+ },
193
+ {
194
+ "ename": "NameError",
195
+ "evalue": "name 'compound_feature1' is not defined",
196
+ "output_type": "error",
197
+ "traceback": [
198
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
199
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
200
+ "Cell \u001b[0;32mIn [3], line 16\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError somewhere...\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mrepr\u001b[39m(e))\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mtype\u001b[39m(compound_feature1))\n",
201
+ "\u001b[0;31mNameError\u001b[0m: name 'compound_feature1' is not defined"
202
+ ]
203
+ }
204
+ ],
205
+ "source": [
206
+ "\n",
207
+ "enz_str =\"A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN\"\n",
208
+ "\n",
209
+ "comp_str = 'C00149:O[C@@H](CC([O-])=O)C([O-])=O'\n",
210
+ "try:\n",
211
+ " prot_feature, prot_id = prot_feature_gen_from_str_input(enz_str)\n",
212
+ " kegg_id_flag = 0\n",
213
+ " comp_feature, comp_id = mol_feature_gen_from_str_input(comp_str, kegg_id_flag, kegg_df)\n",
214
+ "\n",
215
+ " act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)\n",
216
+ " # pdb.set_trace()\n",
217
+ " compound_feature1 = compound_feature_gen_df_input(act_dataframe, comp_feature)\n",
218
+ "\n",
219
+ "except Exception as e:\n",
220
+ " print('Error somewhere...' + repr(e))\n",
221
+ "\n",
222
+ "print(type(compound_feature1))\n"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 11,
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "name": "stdout",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "1/1 [==============================] - 0s 223ms/step\n"
235
+ ]
236
+ }
237
+ ],
238
+ "source": [
239
+ "\n",
240
+ "EnzRankScore = model_prediction(compound_feature1, prot_feature, loaded_model)\n",
241
+ "es = EnzRankScore"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 12,
247
+ "metadata": {},
248
+ "outputs": [
249
+ {
250
+ "data": {
251
+ "text/plain": [
252
+ "0.9315796"
253
+ ]
254
+ },
255
+ "execution_count": 12,
256
+ "metadata": {},
257
+ "output_type": "execute_result"
258
+ }
259
+ ],
260
+ "source": [
261
+ "es"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": null,
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": []
270
+ }
271
+ ],
272
+ "metadata": {
273
+ "kernelspec": {
274
+ "display_name": "Python 3 (ipykernel)",
275
+ "language": "python",
276
+ "name": "python3"
277
+ },
278
+ "language_info": {
279
+ "codemirror_mode": {
280
+ "name": "ipython",
281
+ "version": 3
282
+ },
283
+ "file_extension": ".py",
284
+ "mimetype": "text/x-python",
285
+ "name": "python",
286
+ "nbconvert_exporter": "python",
287
+ "pygments_lexer": "ipython3",
288
+ "version": "3.8.8"
289
+ }
290
+ },
291
+ "nbformat": 4,
292
+ "nbformat_minor": 4
293
+ }
Streamlit/header.png ADDED
Streamlit/main.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ from PIL import Image
6
+ import webbrowser
7
+
8
+ from rdkit import Chem
9
+ from rdkit.Chem import AllChem
10
+ from rdkit.Chem import Draw
11
+ from rdkit.Chem import rdChemReactions as Reactions
12
+
13
+ import tensorflow as tf
14
+ from tensorflow import keras
15
+ from keras.preprocessing import sequence
16
+ from keras.utils import pad_sequences
17
+ import keras
18
+ from keras import backend as K
19
+ from keras.models import load_model
20
+ import argparse
21
+ import h5py
22
+ import pdb
23
+
24
+
25
# Residue letters, in the order that fixes their integer codes.
seq_rdic = list('AILVFWYNCQMSTDERHKGPOUXBZ')
# Letter -> 1-based code; 0 stays free (encodeSeq returns [0] for a missing sequence).
seq_dic = {letter: code for code, letter in enumerate(seq_rdic, start=1)}
28
+
29
+
30
@st.cache(allow_output_mutation=True)
def encodeSeq(seq, seq_dic):
    """Translate a residue sequence into its list of integer codes.

    Args:
        seq: Sequence text, or a null value as judged by ``pd.isnull``.
        seq_dic: Mapping from residue letter to integer code.

    Returns:
        ``[0]`` when ``seq`` is null, otherwise one code per character.

    Raises:
        KeyError: If a character of ``seq`` is not a key of ``seq_dic``.
    """
    if not pd.isnull(seq):
        return [seq_dic[residue] for residue in seq]
    return [0]
36
+
37
+
38
@st.cache(allow_output_mutation=True)
def load_modelfile(model_string):
    """Load the saved Keras model located at ``model_string`` and return it."""
    return tf.keras.models.load_model(model_string)
42
+
43
+
44
@st.cache(allow_output_mutation=True)
def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):
    """Turn an ``id:sequence`` string into a padded integer feature array.

    Args:
        prot_input_str: Text of the form ``<protein id>:<sequence>``.
        prot_len: Value handed to ``pad_sequences`` as ``maxlen``.

    Returns:
        ``(prot_feature, Prot_ID)``: the array ``pad_sequences`` builds from
        the single encoded sequence, and the text before the first ``:``.

    Raises:
        ValueError: If ``prot_input_str`` has no ``:`` separator.
        KeyError: If the sequence contains a character that is not in
            ``seq_dic`` (raised by ``encodeSeq``).
    """
    fields = prot_input_str.split(':')
    if len(fields) < 2:
        raise ValueError(
            "Expected protein input as 'id:Sequence', got " + repr(prot_input_str))
    Prot_ID, Prot_seq = fields[0], fields[1]

    # pad_sequences works on a batch, so wrap the one encoded sequence in a list.
    prot_feature = pad_sequences(
        [encodeSeq(Prot_seq, seq_dic)], maxlen=prot_len)

    return prot_feature, Prot_ID
58
+
59
+
60
@st.cache(allow_output_mutation=True)
def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):
    """Build the one-row compound table for a substrate given as text.

    Args:
        mol_str: A bare compound id when ``kegg_id_flag == 1``; otherwise an
            ``id:SMILES`` string.
        kegg_id_flag: ``1`` looks the compound up in ``kegg_df``; any other
            value fingerprints the SMILES with RDKit.
        kegg_df: Table with a ``Compound_ID`` column; only read in the
            lookup branch.

    Returns:
        ``(compound_df, compound_id)`` where ``compound_df`` is indexed by
        ``Compound_ID``.

    Raises:
        ValueError: If the id is absent from ``kegg_df``, or if the
            ``id:SMILES`` text cannot be split, parsed or fingerprinted.
    """
    if kegg_id_flag == 1:
        hits = kegg_df.index[kegg_df.Compound_ID == mol_str]
        if len(hits) == 0:
            raise ValueError(
                'Compound ID not found in KEGG table: ' + repr(mol_str))
        compound_df = kegg_df.loc[hits[0]].to_frame().T.set_index('Compound_ID')
        return compound_df, mol_str

    try:
        # Split only on the first ':' because a SMILES string may itself
        # contain ':' characters.
        mol_ID, mol_smiles = mol_str.split(':', 1)
        mol = Chem.MolFromSmiles(mol_smiles)
        if mol is None:
            raise ValueError('RDKit could not parse SMILES ' + repr(mol_smiles))
        fp1 = AllChem.GetMorganFingerprintAsBitVect(
            mol, useChirality=True, radius=2, nBits=2048)
        mol_fp = '\t'.join(str(bit) for bit in np.array(fp1).astype(float))
    except Exception as error:
        # Raise instead of printing and falling through: the old code then
        # failed on unbound locals, hiding the real cause from the caller.
        raise ValueError(
            'Something wrong with molecule input string...' + repr(error)) from error

    mol_info_df = pd.DataFrame(
        {'Compound_ID': mol_ID, 'Smiles': mol_smiles, 'morgan_fp_r2': mol_fp},
        index=[0])
    mol_info_df = mol_info_df.set_index('Compound_ID')

    return mol_info_df, mol_ID
98
+
99
+
100
@st.cache(allow_output_mutation=True)
def act_df_gen_mol_feature(mol_id, prot_id):
    """Return a one-row table pairing ``prot_id`` with ``mol_id``.

    The row has index ``0`` and the columns ``Protein_ID`` and
    ``Compound_ID``, in that order.
    """
    pair = {'Protein_ID': prot_id, 'Compound_ID': mol_id}
    return pd.DataFrame(pair, index=[0])
106
+
107
+
108
@st.cache(allow_output_mutation=True)
def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):
    """Look up each pair's compound vector and return them as a float array.

    Args:
        act_df: Table with a ``Compound_ID`` column.
        comp_df: Table indexed by compound id that holds ``comp_vec``.
        comp_len: Accepted but not used by this function.
        comp_vec: Column of ``comp_df`` holding tab-separated numbers.

    Returns:
        A float array with one row per merged row of ``act_df``.
    """
    merged = pd.merge(act_df, comp_df, left_on='Compound_ID', right_index=True)
    split_rows = [fp.split("\t") for fp in merged[comp_vec]]
    return np.stack(split_rows).astype('float')
114
+
115
+
116
@st.cache(allow_output_mutation=True)
def model_prediction(compound_feature, enz_feature, model):
    """Run ``model.predict`` on the two inputs and return element ``[0][0]``.

    The compound features are passed first and the enzyme features second.
    """
    scores = model.predict([compound_feature, enz_feature])
    first_row = scores[0]
    return first_row[0]
121
+
122
+
123
+ # loaded_model = load_modelfile('./../CNN_results/model_final.model')
124
+
125
+ # KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')
126
+ # kegg_df = KEGG_compound_read.reset_index()
127
+
128
+
129
def main():
    """Render the Streamlit page and score one enzyme-substrate pair.

    Loads the model and the KEGG compound table, draws the input widgets
    and, once "Predict" is pressed, builds both feature arrays and writes
    the model score. If feature generation fails, the error is shown and
    no prediction is attempted.
    """
    ld_model = tf.keras.models.load_model('./../CNN_results_split_final/Final_model.model')

    KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col='Compound_ID')
    kegg_df = KEGG_compound_read.reset_index()

    st.image('./header.png', use_column_width=True)

    st.subheader('Enzyme-Substrate Activity Predictor ')

    st.subheader('Enzyme sequence')
    st.caption('Please follow the input format show in the text box--> id:Sequence')

    enz_str = st.text_input('', value="A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN")

    st.subheader('Substrate ')
    st.caption('Please follow the input format show in the text box--> KEGG id or click the checkbox')

    comp_str = st.text_input('', value="C00149")
    if st.checkbox('If you are entering smiles string along with KEGG ID'):
        add_info = st.text_area('Additional information (id: Smiles):', "C00149:O[C@@H](CC([O-])=O)C([O-])=O")
    else:
        add_info = ''

    if not st.button("Predict"):
        return

    with st.spinner('Calculating...'):
        try:
            prot_feature, prot_id = prot_feature_gen_from_str_input(enz_str)
            if not add_info:
                # No SMILES supplied: treat the text box as a KEGG id lookup.
                kegg_id_flag = 1
                comp_feature, comp_id = mol_feature_gen_from_str_input(comp_str, kegg_id_flag, kegg_df)
            else:
                kegg_id_flag = 0
                comp_feature, comp_id = mol_feature_gen_from_str_input(add_info, kegg_id_flag, kegg_df)

            act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)
            compound_feature = compound_feature_gen_df_input(act_dataframe, comp_feature)

        except Exception as e:
            st.write('Error somewhere...' + repr(e))
            # Stop here: the feature variables are unbound, so predicting
            # would only replace the message above with a NameError.
            return

        y = ld_model.predict([compound_feature, prot_feature])

    subheaderstring = 'EnzRank Score for '+ prot_id + '-' + comp_id + ' pair:'
    st.subheader(subheaderstring)
    st.write(str(y[0][0]))


if __name__ == '__main__':
    main()
Streamlit/streamlit-main-2022-10-10-17-10-73.webm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:491b9f0b51969d6171fc1daccf7de1f97a8a8b616a2d2c13afaf252bf23c753c
3
+ size 7700063
Streamlit/test_run_file.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ from rdkit import Chem
6
+ from rdkit.Chem import AllChem
7
+ # from rdkit.Chem import Draw
8
+ from rdkit.Chem import rdChemReactions as Reactions
9
+
10
+ import tensorflow as tf
11
+ from tensorflow import keras
12
+ from keras.preprocessing import sequence
13
+ from keras.utils import pad_sequences
14
+ import keras
15
+ from keras import backend as K
16
+ from keras.models import load_model
17
+ import argparse
18
+ import h5py
19
+ import pdb
20
+
21
+
22
# Residue letters, in the order that fixes their integer codes.
seq_rdic = list('AILVFWYNCQMSTDERHKGPOUXBZ')
# Letter -> 1-based code; 0 stays free (encodeSeq returns [0] for a missing sequence).
seq_dic = {letter: code for code, letter in enumerate(seq_rdic, start=1)}
24
+
25
+
26
def encodeSeq(seq, seq_dic):
    """Translate a residue sequence into its list of integer codes.

    Args:
        seq: Sequence text, or a null value as judged by ``pd.isnull``.
        seq_dic: Mapping from residue letter to integer code.

    Returns:
        ``[0]`` when ``seq`` is null, otherwise one code per character.

    Raises:
        KeyError: If a character of ``seq`` is not a key of ``seq_dic``.
    """
    if not pd.isnull(seq):
        return [seq_dic[residue] for residue in seq]
    return [0]
31
+
32
+
33
def load_modelfile(model_string):
    """Load the saved Keras model located at ``model_string`` and return it."""
    return tf.keras.models.load_model(model_string)
36
+
37
+
38
def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):
    """Turn an ``id:sequence`` string into a padded integer feature array.

    Args:
        prot_input_str: Text of the form ``<protein id>:<sequence>``.
        prot_len: Value handed to ``pad_sequences`` as ``maxlen``.

    Returns:
        ``(prot_feature, Prot_ID)``: the array ``pad_sequences`` builds from
        the single encoded sequence, and the text before the first ``:``.

    Raises:
        ValueError: If ``prot_input_str`` has no ``:`` separator.
        KeyError: If the sequence contains a character that is not in
            ``seq_dic`` (raised by ``encodeSeq``).
    """
    fields = prot_input_str.split(':')
    if len(fields) < 2:
        raise ValueError(
            "Expected protein input as 'id:Sequence', got " + repr(prot_input_str))
    Prot_ID, Prot_seq = fields[0], fields[1]

    # pad_sequences works on a batch, so wrap the one encoded sequence in a list.
    prot_feature = pad_sequences(
        [encodeSeq(Prot_seq, seq_dic)], maxlen=prot_len)

    return prot_feature, Prot_ID
51
+
52
+
53
def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):
    """Build the one-row compound table for a substrate given as text.

    Args:
        mol_str: A bare compound id when ``kegg_id_flag == 1``; otherwise an
            ``id:SMILES`` string.
        kegg_id_flag: ``1`` looks the compound up in ``kegg_df``; any other
            value fingerprints the SMILES with RDKit.
        kegg_df: Table with a ``Compound_ID`` column; only read in the
            lookup branch.

    Returns:
        ``(compound_df, compound_id)`` where ``compound_df`` is indexed by
        ``Compound_ID``.

    Raises:
        ValueError: If the id is absent from ``kegg_df``, or if the
            ``id:SMILES`` text cannot be split, parsed or fingerprinted.
    """
    if kegg_id_flag == 1:
        hits = kegg_df.index[kegg_df.Compound_ID == mol_str]
        if len(hits) == 0:
            raise ValueError(
                'Compound ID not found in KEGG table: ' + repr(mol_str))
        compound_df = kegg_df.loc[hits[0]].to_frame().T.set_index('Compound_ID')
        return compound_df, mol_str

    try:
        # Split only on the first ':' because a SMILES string may itself
        # contain ':' characters.
        mol_ID, mol_smiles = mol_str.split(':', 1)
        mol = Chem.MolFromSmiles(mol_smiles)
        if mol is None:
            raise ValueError('RDKit could not parse SMILES ' + repr(mol_smiles))
        fp1 = AllChem.GetMorganFingerprintAsBitVect(
            mol, useChirality=True, radius=2, nBits=2048)
        mol_fp = '\t'.join(str(bit) for bit in np.array(fp1).astype(float))
    except Exception as error:
        # Raise instead of printing and falling through: the old code then
        # failed on unbound locals, hiding the real cause from the caller.
        raise ValueError(
            'Something wrong with molecule input string...' + repr(error)) from error

    mol_info_df = pd.DataFrame(
        {'Compound_ID': mol_ID, 'Smiles': mol_smiles, 'morgan_fp_r2': mol_fp},
        index=[0])
    # set_index returns a new frame; keep it so the result really is indexed
    # by Compound_ID, which the right_index merge downstream relies on.
    mol_info_df = mol_info_df.set_index('Compound_ID')

    return mol_info_df, mol_ID
90
+
91
+
92
def act_df_gen_mol_feature(mol_id, prot_id):
    """Return a one-row table pairing ``prot_id`` with ``mol_id``.

    The row has index ``0`` and the columns ``Protein_ID`` and
    ``Compound_ID``, in that order.
    """
    pair = {'Protein_ID': prot_id, 'Compound_ID': mol_id}
    return pd.DataFrame(pair, index=[0])
97
+
98
+
99
def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):
    """Look up each pair's compound vector and return them as a float array.

    Args:
        act_df: Table with a ``Compound_ID`` column.
        comp_df: Table indexed by compound id that holds ``comp_vec``.
        comp_len: Accepted but not used by this function.
        comp_vec: Column of ``comp_df`` holding tab-separated numbers.

    Returns:
        A float array with one row per merged row of ``act_df``.
    """
    merged = pd.merge(act_df, comp_df, left_on='Compound_ID', right_index=True)
    split_rows = [fp.split("\t") for fp in merged[comp_vec]]
    return np.stack(split_rows).astype('float')
104
+
105
+
106
def model_prediction(compound_feature, enz_feature, model):
    """Run ``model.predict`` on the two inputs and return element ``[0][0]``.

    The compound features are passed first and the enzyme features second.
    """
    scores = model.predict([compound_feature, enz_feature])
    first_row = scores[0]
    return first_row[0]
110
+
111
+
112
+ # loaded_model = load_modelfile('./../CNN_results/model_final.model')
113
+
114
+ # KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')
115
+ # kegg_df = KEGG_compound_read.reset_index()
116
+
117
+
118
def main():
    """Score one hard-coded enzyme/compound pair and print the result.

    Loads the model and the KEGG compound table, builds the features for
    the built-in example inputs and prints the score. If feature
    generation fails, the error is printed and no prediction is attempted.
    """
    loaded_model = load_modelfile('./../CNN_results_split_final/Final_model.model')
    KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col='Compound_ID')
    kegg_df = KEGG_compound_read.reset_index()

    enz_str = "A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN"

    comp_str = "C00149"
    try:
        prot_feature, prot_id = prot_feature_gen_from_str_input(enz_str)
        kegg_id_flag = 1
        comp_feature, comp_id = mol_feature_gen_from_str_input(comp_str, kegg_id_flag, kegg_df)

        act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)
        compound_feature = compound_feature_gen_df_input(act_dataframe, comp_feature)

    except Exception as e:
        print('Error somewhere...' + repr(e))
        # Stop here: the feature variables are unbound, so predicting would
        # only replace the message above with a NameError.
        return

    EnzRankScore = model_prediction(compound_feature, prot_feature, loaded_model)
    es = EnzRankScore

    print('something has happened')
    print('EnzRank score')
    print(es)


if __name__ == '__main__':
    main()