Spaces:

vuu10
/

EnzRank

Running

File size: 12,084 Bytes

c0a3508

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'rdkit'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn [1], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mrdkit\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Chem\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mrdkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mChem\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AllChem\n\u001b[1;32m      6\u001b[0m \u001b[38;5;66;03m# from rdkit.Chem import Draw\u001b[39;00m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'rdkit'"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from rdkit import Chem\n",
    "from rdkit.Chem import AllChem\n",
    "# from rdkit.Chem import Draw\n",
    "from rdkit.Chem import rdChemReactions as Reactions\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "from keras.preprocessing import sequence\n",
    "from keras.utils import pad_sequences\n",
    "import keras\n",
    "from keras import backend as K\n",
    "from keras.models import load_model\n",
    "import argparse\n",
    "import h5py\n",
    "import pdb\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# One-letter residue alphabet; a symbol's position in this list fixes its integer code.\n",
    "seq_rdic = list('AILVFWYNCQMSTDERHKGPOUXBZ')\n",
    "# Codes start at 1, so 0 never stands for a residue (encodeSeq returns [0] for a null sequence).\n",
    "seq_dic = dict(zip(seq_rdic, range(1, len(seq_rdic) + 1)))\n",
    "\n",
    "\n",
    "def encodeSeq(seq, seq_dic):\n",
    "    \"\"\"Translate a residue string into a list of integer codes.\n",
    "\n",
    "    Args:\n",
    "        seq: iterable of residue symbols, or a null value (as judged by ``pd.isnull``).\n",
    "        seq_dic: mapping from residue symbol to integer code.\n",
    "\n",
    "    Returns:\n",
    "        ``[0]`` when ``seq`` is null, otherwise one code per symbol, in order.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a symbol in ``seq`` is missing from ``seq_dic``.\n",
    "    \"\"\"\n",
    "    if not pd.isnull(seq):\n",
    "        return [seq_dic[residue] for residue in seq]\n",
    "    return [0]\n",
    "\n",
    "\n",
    "def load_modelfile(model_string):\n",
    "    \"\"\"Load and return the saved model at ``model_string`` via ``tf.keras.models.load_model``.\"\"\"\n",
    "    return tf.keras.models.load_model(model_string)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'load_modelfile' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn [4], line 80\u001b[0m\n\u001b[1;32m     72\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m prediction_vals[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m     75\u001b[0m \u001b[38;5;66;03m# loaded_model = load_modelfile('./../CNN_results/model_final.model')\u001b[39;00m\n\u001b[1;32m     76\u001b[0m \n\u001b[1;32m     77\u001b[0m \u001b[38;5;66;03m# KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')\u001b[39;00m\n\u001b[1;32m     78\u001b[0m \u001b[38;5;66;03m# kegg_df = KEGG_compound_read.reset_index()\u001b[39;00m\n\u001b[0;32m---> 80\u001b[0m loaded_model \u001b[38;5;241m=\u001b[39m load_modelfile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./../CNN_results_split_final/Final_model.model\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m     81\u001b[0m KEGG_compound_read \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./../CNN_data/Final_test/kegg_compound.csv\u001b[39m\u001b[38;5;124m'\u001b[39m, index_col \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCompound_ID\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m     82\u001b[0m kegg_df \u001b[38;5;241m=\u001b[39m KEGG_compound_read\u001b[38;5;241m.\u001b[39mreset_index()\n",
      "\u001b[0;31mNameError\u001b[0m: name 'load_modelfile' is not defined"
     ]
    }
   ],
   "source": [
    "\n",
    "def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):\n",
    "    \"\"\"Build the padded, integer-encoded protein feature from an 'ID:SEQUENCE' string.\n",
    "\n",
    "    Args:\n",
    "        prot_input_str: text of the form '<protein id>:<sequence>'.\n",
    "        prot_len: value passed to ``pad_sequences`` as ``maxlen``.\n",
    "\n",
    "    Returns:\n",
    "        (prot_feature, Prot_ID): the array ``pad_sequences`` returns for the single\n",
    "        encoded sequence, and the protein id taken from before the first ':'.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: if ``prot_input_str`` contains no ':'.\n",
    "        KeyError: if the sequence holds a symbol that is absent from ``seq_dic``.\n",
    "    \"\"\"\n",
    "    fields = prot_input_str.split(':')\n",
    "    Prot_ID, Prot_seq = fields[0], fields[1]\n",
    "\n",
    "    # Encode the one sequence directly; no intermediate DataFrame is needed\n",
    "    # because only the encoded list is handed to pad_sequences.\n",
    "    encoded_sequence = encodeSeq(Prot_seq, seq_dic)\n",
    "    prot_feature = pad_sequences([encoded_sequence], prot_len)\n",
    "\n",
    "    return prot_feature, Prot_ID\n",
    "\n",
    "\n",
    "def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):\n",
    "    \"\"\"Build a one-row compound frame, indexed by Compound_ID, for one molecule.\n",
    "\n",
    "    Args:\n",
    "        mol_str: a KEGG compound id when ``kegg_id_flag == 1``; otherwise text of\n",
    "            the form '<compound id>:<SMILES>'.\n",
    "        kegg_id_flag: 1 to look the compound up in ``kegg_df``; any other value to\n",
    "            compute the fingerprint from the SMILES with RDKit.\n",
    "        kegg_df: DataFrame with a ``Compound_ID`` column (only read when\n",
    "            ``kegg_id_flag == 1``).\n",
    "\n",
    "    Returns:\n",
    "        (frame, compound_id). ``frame`` has a single row and ``Compound_ID`` as its\n",
    "        index in both branches, which is what a merge on Compound_ID with\n",
    "        ``right_index=True`` requires. In the SMILES branch its columns are\n",
    "        ``Smiles`` and ``morgan_fp_r2`` (tab-joined bit values).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the KEGG id is absent from ``kegg_df``, or the\n",
    "            '<id>:<SMILES>' text cannot be parsed or fingerprinted.\n",
    "    \"\"\"\n",
    "    if kegg_id_flag == 1:\n",
    "        KEGG_ID = mol_str\n",
    "        matches = kegg_df.index[kegg_df.Compound_ID == KEGG_ID]\n",
    "        if len(matches) == 0:\n",
    "            raise ValueError('KEGG compound ID not found: ' + repr(KEGG_ID))\n",
    "        KEGG_ID_info = kegg_df.loc[matches[0]]\n",
    "        return KEGG_ID_info.to_frame().T.set_index('Compound_ID'), KEGG_ID\n",
    "\n",
    "    try:\n",
    "        # Split on the first ':' only: ':' is also legal inside SMILES\n",
    "        # (aromatic bonds, atom classes such as [CH3:1]).\n",
    "        mol_ID, mol_smiles = mol_str.split(':', 1)\n",
    "        mol = Chem.MolFromSmiles(mol_smiles)\n",
    "        if mol is None:\n",
    "            raise ValueError('RDKit could not parse SMILES ' + repr(mol_smiles))\n",
    "        fp1 = AllChem.GetMorganFingerprintAsBitVect(\n",
    "            mol, useChirality=True, radius=2, nBits=2048)\n",
    "        mol_fp = '\\t'.join(str(bit) for bit in np.array(fp1).astype(float))\n",
    "\n",
    "        mol_info_df = pd.DataFrame(\n",
    "            {'Compound_ID': mol_ID, 'Smiles': mol_smiles, 'morgan_fp_r2': mol_fp},\n",
    "            index=[0])\n",
    "        # set_index returns a new frame, so the result has to be kept.\n",
    "        mol_info_df = mol_info_df.set_index('Compound_ID')\n",
    "    except Exception as error:\n",
    "        # Surface the real cause rather than reaching the return statement\n",
    "        # with nothing to return.\n",
    "        raise ValueError(\n",
    "            'Something wrong with molecule input string...' + repr(error)) from error\n",
    "\n",
    "    return mol_info_df, mol_ID\n",
    "\n",
    "\n",
    "def act_df_gen_mol_feature(mol_id, prot_id):\n",
    "    \"\"\"Return a one-row DataFrame pairing a protein id with a compound id.\n",
    "\n",
    "    Columns are ``Protein_ID`` then ``Compound_ID``; the single row has index 0.\n",
    "    \"\"\"\n",
    "    pair = {'Protein_ID': prot_id, 'Compound_ID': mol_id}\n",
    "    return pd.DataFrame(pair, index=[0])\n",
    "\n",
    "\n",
    "def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):\n",
    "    \"\"\"Return the compound fingerprints for the rows of ``act_df`` as a float array.\n",
    "\n",
    "    ``act_df`` is joined to ``comp_df`` (its ``Compound_ID`` column against the\n",
    "    index of ``comp_df``); each joined row's ``comp_vec`` string is split on tabs,\n",
    "    and the pieces are stacked and cast to float.\n",
    "\n",
    "    ``comp_len`` is accepted but not used by this function.\n",
    "    \"\"\"\n",
    "    joined = act_df.merge(comp_df, left_on='Compound_ID', right_index=True)\n",
    "    token_rows = [fp.split(\"\\t\") for fp in joined[comp_vec]]\n",
    "    return np.stack(token_rows).astype('float')\n",
    "\n",
    "\n",
    "def model_prediction(compound_feature, enz_feature, model):\n",
    "    \"\"\"Run ``model.predict`` on ``[compound_feature, enz_feature]`` and return element ``[0][0]``.\"\"\"\n",
    "    scores = model.predict([compound_feature, enz_feature])\n",
    "    first_row = scores[0]\n",
    "    return first_row[0]\n",
    "\n",
    "\n",
    "# loaded_model = load_modelfile('./../CNN_results/model_final.model')\n",
    "\n",
    "# KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')\n",
    "# kegg_df = KEGG_compound_read.reset_index()\n",
    "\n",
    "# Model handed to model_prediction in the scoring cell further down.\n",
    "loaded_model = load_modelfile(\n",
    "    './../CNN_results_split_final/Final_model.model')\n",
    "\n",
    "# KEGG compound table; reset_index() turns Compound_ID back into a regular column.\n",
    "KEGG_compound_read = pd.read_csv(\n",
    "    './../CNN_data/Final_test/kegg_compound.csv',\n",
    "    index_col='Compound_ID',\n",
    ")\n",
    "kegg_df = KEGG_compound_read.reset_index()\n",
    "\n",
    "\n",
    "# def img_to_bytes(img_path):\n",
    "#     img_bytes = Path(img_path).read_bytes()\n",
    "#     encoded = base64.b64encode(img_bytes).decode()\n",
    "#     return encoded\n",
    "# # st.title('dGPredictor')\n",
    "\n",
    "# header_html = \"<img src='../figures/header.png'>\"\n",
    "\n",
    "# st.markdown(\n",
    "#     header_html, unsafe_allow_html=True,\n",
    "# )\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error somewhere...NameError(\"name 'prot_feature_gen_from_str_input' is not defined\")\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'compound_feature1' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn [3], line 16\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m     14\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError somewhere...\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mrepr\u001b[39m(e))\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mtype\u001b[39m(compound_feature1))\n",
      "\u001b[0;31mNameError\u001b[0m: name 'compound_feature1' is not defined"
     ]
    }
   ],
   "source": [
    "\n",
    "enz_str =\"A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN\"\n",
    "\n",
    "comp_str = 'C00149:O[C@@H](CC([O-])=O)C([O-])=O'\n",
    "try:\n",
    "    prot_feature, prot_id = prot_feature_gen_from_str_input(enz_str)\n",
    "    kegg_id_flag = 0  # 0: comp_str is '<id>:<SMILES>' rather than a bare KEGG id\n",
    "    comp_feature, comp_id = mol_feature_gen_from_str_input(comp_str, kegg_id_flag, kegg_df)\n",
    "\n",
    "    act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)\n",
    "    compound_feature1 = compound_feature_gen_df_input(act_dataframe, comp_feature)\n",
    "\n",
    "except Exception as e:\n",
    "    print('Error somewhere...' + repr(e))\n",
    "    # Re-raise: carrying on would only fail on the next statement with a\n",
    "    # misleading NameError for compound_feature1.\n",
    "    raise\n",
    "\n",
    "print(type(compound_feature1))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1/1 [==============================] - 0s 223ms/step\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Score the enzyme/compound pair; `es` is a short alias displayed by the next cell.\n",
    "EnzRankScore = es = model_prediction(compound_feature1, prot_feature, loaded_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9315796"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "es"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}