{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'rdkit'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn [1], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mrdkit\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Chem\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mrdkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mChem\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AllChem\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# from rdkit.Chem import Draw\u001b[39;00m\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'rdkit'" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "from rdkit import Chem\n", "from rdkit.Chem import AllChem\n", "# from rdkit.Chem import Draw\n", "from rdkit.Chem import rdChemReactions as Reactions\n", "\n", "import tensorflow as tf\n", "from tensorflow import keras\n", "from keras.preprocessing import sequence\n", "from keras.utils import pad_sequences\n", "import keras\n", "from keras import backend as K\n", "from keras.models import load_model\n", "import argparse\n", "import h5py\n", "import pdb\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "seq_rdic = ['A', 'I', 'L', 'V', 'F', 'W', 'Y', 'N', 'C', 'Q', 'M','S', 'T', 'D', 'E', 'R', 
'H', 'K', 'G', 'P', 'O', 'U', 'X', 'B', 'Z']\n",
"# Codes start at 1; encodeSeq returns 0 for a missing sequence.\n",
"seq_dic = {residue: code for code, residue in enumerate(seq_rdic, start=1)}\n",
"\n",
"\n",
"def encodeSeq(seq, seq_dic):\n",
"    \"\"\"Translate an amino-acid sequence into a list of integer codes.\n",
"\n",
"    Args:\n",
"        seq: Iterable of one-letter residue symbols (normally a str), or a\n",
"            null value (None / NaN).\n",
"        seq_dic: Mapping from residue symbol to integer code.\n",
"\n",
"    Returns:\n",
"        [0] when seq is null, otherwise a list with one code per residue.\n",
"\n",
"    Raises:\n",
"        KeyError: If a residue has no entry in seq_dic, neither as given nor\n",
"            in upper case.\n",
"    \"\"\"\n",
"    if pd.isnull(seq):\n",
"        return [0]\n",
"\n",
"    encoded = []\n",
"    for position, residue in enumerate(seq, start=1):\n",
"        code = seq_dic.get(residue)\n",
"        if code is None and isinstance(residue, str):\n",
"            # Treat lower-case input as the same residue instead of failing on it.\n",
"            code = seq_dic.get(residue.upper())\n",
"        if code is None:\n",
"            # Same exception type as a plain dict lookup, but with context.\n",
"            raise KeyError(f'Unknown residue {residue!r} at position {position}')\n",
"        encoded.append(code)\n",
"    return encoded\n",
"\n",
"\n",
"def load_modelfile(model_string):\n",
"    \"\"\"Load a saved Keras model.\n",
"\n",
"    Args:\n",
"        model_string: Path of the saved model, passed unchanged to\n",
"            tf.keras.models.load_model.\n",
"\n",
"    Returns:\n",
"        Whatever tf.keras.models.load_model returns for that path.\n",
"    \"\"\"\n",
"    return tf.keras.models.load_model(model_string)\n"
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'load_modelfile' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn [4], line 80\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m prediction_vals[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# loaded_model = load_modelfile('./../CNN_results/model_final.model')\u001b[39;00m\n\u001b[1;32m 76\u001b[0m \n\u001b[1;32m 77\u001b[0m \u001b[38;5;66;03m# KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')\u001b[39;00m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;66;03m# kegg_df = KEGG_compound_read.reset_index()\u001b[39;00m\n\u001b[0;32m---> 80\u001b[0m loaded_model \u001b[38;5;241m=\u001b[39m load_modelfile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./../CNN_results_split_final/Final_model.model\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 81\u001b[0m KEGG_compound_read \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./../CNN_data/Final_test/kegg_compound.csv\u001b[39m\u001b[38;5;124m'\u001b[39m, index_col \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCompound_ID\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 82\u001b[0m kegg_df \u001b[38;5;241m=\u001b[39m 
KEGG_compound_read\u001b[38;5;241m.\u001b[39mreset_index()\n", "\u001b[0;31mNameError\u001b[0m: name 'load_modelfile' is not defined" ] } ], "source": [
"def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):\n",
"    \"\"\"Encode an 'ID:SEQUENCE' protein string as a padded integer array.\n",
"\n",
"    Args:\n",
"        prot_input_str: Protein identifier and amino-acid sequence joined by ':'.\n",
"        prot_len: Length handed to pad_sequences as maxlen.\n",
"\n",
"    Returns:\n",
"        Tuple (prot_feature, Prot_ID): the pad_sequences output for the single\n",
"        encoded sequence, and the text in front of the first ':'.\n",
"\n",
"    Raises:\n",
"        ValueError: If prot_input_str contains no ':'.\n",
"    \"\"\"\n",
"    fields = prot_input_str.split(':')\n",
"    if len(fields) < 2:\n",
"        raise ValueError(\n",
"            'Protein input must look like ID:SEQUENCE, got ' + repr(prot_input_str))\n",
"    Prot_ID, Prot_seq = fields[0], fields[1]\n",
"\n",
"    # pad_sequences works on a batch, so wrap the single encoded sequence in a list.\n",
"    prot_feature = pad_sequences([encodeSeq(Prot_seq, seq_dic)], maxlen=prot_len)\n",
"\n",
"    return prot_feature, Prot_ID\n",
"\n",
"\n",
"def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):\n",
"    \"\"\"Return the one-row compound table and identifier for a KEGG ID or SMILES.\n",
"\n",
"    Args:\n",
"        mol_str: A KEGG compound ID when kegg_id_flag == 1, otherwise an\n",
"            'ID:SMILES' string.\n",
"        kegg_id_flag: 1 to look mol_str up in kegg_df; any other value to\n",
"            compute a Morgan fingerprint from the SMILES.\n",
"        kegg_df: DataFrame with a Compound_ID column holding the precomputed\n",
"            compound rows.\n",
"\n",
"    Returns:\n",
"        Tuple (compound_df, compound_id); compound_df is a one-row DataFrame\n",
"        indexed by Compound_ID in both branches.\n",
"\n",
"    Raises:\n",
"        IndexError: If the KEGG ID is not present in kegg_df.\n",
"        ValueError: If the 'ID:SMILES' string cannot be turned into a fingerprint.\n",
"    \"\"\"\n",
"    if kegg_id_flag == 1:\n",
"        matches = kegg_df.index[kegg_df.Compound_ID == mol_str]\n",
"        if len(matches) == 0:\n",
"            raise IndexError('KEGG compound ID not found: ' + repr(mol_str))\n",
"        compound_row = kegg_df.loc[matches[0]]\n",
"        return compound_row.to_frame().T.set_index('Compound_ID'), mol_str\n",
"\n",
"    try:\n",
"        # Split on the first ':' only, because a SMILES string may itself contain ':'.\n",
"        mol_ID, separator, mol_smiles = mol_str.partition(':')\n",
"        if not separator:\n",
"            raise ValueError('expected ID:SMILES, got ' + repr(mol_str))\n",
"        mol = Chem.MolFromSmiles(mol_smiles)\n",
"        if mol is None:\n",
"            # RDKit reports an unparsable SMILES by returning None.\n",
"            raise ValueError('could not parse SMILES ' + repr(mol_smiles))\n",
"        fp1 = AllChem.GetMorganFingerprintAsBitVect(\n",
"            mol, useChirality=True, radius=2, nBits=2048)\n",
"        # Tab-separated text, the format compound_feature_gen_df_input splits again.\n",
"        mol_fp = '\\t'.join(map(str, np.array(fp1).astype(float)))\n",
"    except Exception as error:\n",
"        # Re-raise with context instead of reaching the return with nothing to return.\n",
"        raise ValueError(\n",
"            'Something wrong with molecule input string...' + repr(error)) from error\n",
"\n",
"    mol_info_df = pd.DataFrame(\n",
"        {'Compound_ID': mol_ID, 'Smiles': mol_smiles, 'morgan_fp_r2': mol_fp},\n",
"        index=[0])\n",
"    # set_index returns a new frame; use it so this branch is keyed by\n",
"    # Compound_ID like the KEGG branch, matching the right_index=True merge in\n",
"    # compound_feature_gen_df_input.\n",
"    return mol_info_df.set_index('Compound_ID'), mol_ID\n",
"\n",
"\n",
"def act_df_gen_mol_feature(mol_id, prot_id):\n",
"    \"\"\"Create the one-row protein/compound pairing table.\n",
"\n",
"    Returns:\n",
"        DataFrame with columns Protein_ID and Compound_ID and index [0].\n",
"    \"\"\"\n",
"    return pd.DataFrame({'Protein_ID': prot_id, 'Compound_ID': mol_id}, index=[0])\n",
"\n",
"\n",
"def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):\n",
"    \"\"\"Look up the fingerprint of each pair's compound and return it as floats.\n",
"\n",
"    Args:\n",
"        act_df: DataFrame with a Compound_ID column.\n",
"        comp_df: DataFrame indexed by compound ID whose comp_vec column holds\n",
"            tab-separated fingerprint values.\n",
"        comp_len: Not used by this function; kept so existing calls keep working.\n",
"        comp_vec: Name of the fingerprint column in comp_df.\n",
"\n",
"    Returns:\n",
"        Float array with one row per matched pair.\n",
"\n",
"    Raises:\n",
"        ValueError: If no Compound_ID of act_df is found in the index of comp_df.\n",
"    \"\"\"\n",
"    merged = pd.merge(act_df, comp_df, left_on='Compound_ID', right_index=True)\n",
"    if merged.empty:\n",
"        raise ValueError('No Compound_ID of act_df was found in the index of comp_df')\n",
"    fingerprints = [fp.split('\\t') for fp in merged[comp_vec]]\n",
"    return np.stack(fingerprints).astype('float')\n",
"\n",
"\n",
"def model_prediction(compound_feature, enz_feature, model):\n",
"    \"\"\"Run the model on the compound/enzyme features and return one score.\n",
"\n",
"    Returns:\n",
"        Element [0][0] of model.predict([compound_feature, enz_feature]).\n",
"    \"\"\"\n",
"    prediction_vals = model.predict([compound_feature, enz_feature])\n",
"    return prediction_vals[0][0]\n",
"\n",
"\n",
"# Earlier runs used './../CNN_results/model_final.model'.\n",
"MODEL_PATH = './../CNN_results_split_final/Final_model.model'\n",
"KEGG_COMPOUND_CSV = './../CNN_data/Final_test/kegg_compound.csv'\n",
"\n",
"loaded_model = load_modelfile(MODEL_PATH)\n",
"KEGG_compound_read = pd.read_csv(KEGG_COMPOUND_CSV, index_col='Compound_ID')\n",
"# mol_feature_gen_from_str_input filters on a Compound_ID column, so move the\n",
"# index back into the columns.\n",
"kegg_df = KEGG_compound_read.reset_index()\n"
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Error somewhere...NameError(\"name 'prot_feature_gen_from_str_input' is not defined\")\n" ] }, { "ename": "NameError", "evalue": "name 'compound_feature1' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn [3], line 16\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError somewhere...\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mrepr\u001b[39m(e))\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mtype\u001b[39m(compound_feature1))\n", "\u001b[0;31mNameError\u001b[0m: name 'compound_feature1' is not defined" ] } ], "source": [
"# Inputs follow the 'ID:PAYLOAD' convention expected by the feature builders.\n",
"enz_str = \"A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN\"\n",
"comp_str = 'C00149:O[C@@H](CC([O-])=O)C([O-])=O'\n",
"\n",
"# 1 -> comp_str is looked up as a KEGG compound ID; any other value -> comp_str is 'ID:SMILES'.\n",
"kegg_id_flag = 0\n",
"\n",
"# No blanket try/except: a failing step should stop the cell with its own\n",
"# traceback instead of surfacing later as a NameError on a missing variable.\n",
"prot_feature, prot_id = prot_feature_gen_from_str_input(enz_str)\n",
"comp_feature, comp_id = mol_feature_gen_from_str_input(comp_str, kegg_id_flag, kegg_df)\n",
"act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)\n",
"compound_feature1 = compound_feature_gen_df_input(act_dataframe, comp_feature)\n",
"\n",
"print(type(compound_feature1))\n"
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1/1 [==============================] - 0s 223ms/step\n" ] } ], "source": [ "\n", "EnzRankScore = model_prediction(compound_feature1, prot_feature, loaded_model)\n", "es = EnzRankScore" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9315796" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "es" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 4 }