{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b1b4e507-0ca2-4309-ac5c-0461f99edc72",
   "metadata": {},
   "source": [
    "# Phosformer-ST Example Code"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3a23dd26-2060-4cb1-a1a0-dd97b168a329",
   "metadata": {},
   "source": [
    "## imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec3bd89c-8aa1-408c-b569-c89dc2bb768d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import hashlib\n",
    "import warnings\n",
    "sys.dont_write_bytecode=True\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import torch\n",
    "\n",
    "from tokenization_esm import EsmTokenizer\n",
    "from modeling_esm import EsmForSequenceClassificationMHACustom\n",
    "# for versioning specifics see the ReadMe\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a42e87fc-bd23-4b7b-8234-06cbdcb25bc0",
   "metadata": {},
   "source": [
    "## loading in pre-trained model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9def3d4b-822d-44b8-a896-3e6ee5aca13d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the pre-trained checkpoint handed to from_pretrained() below for both\n",
    "# the tokenizer and the model.\n",
    "# NOTE(review): presumably a directory located next to this notebook -- confirm against the ReadMe.\n",
    "model_dir = 'multitask_MHA_esm2_t30_150M_UR50D_neg_ratio_8+8_shift_30_mask_0.2_2023-03-25_90'\n",
    "\n",
    "# Tokenizer and model are loaded from the same location; num_labels=2 is\n",
    "# forwarded to from_pretrained() when the classification model is built.\n",
    "tokenizer = EsmTokenizer.from_pretrained(model_dir)\n",
    "model     = EsmForSequenceClassificationMHACustom.from_pretrained(model_dir, num_labels=2)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff3f7f18-6cb2-4818-9583-bb729e848b81",
   "metadata": {},
   "source": [
    "## configuring parameters of the Phos-ST model\n",
    "\n",
    "## also organizing the data for input into Phos-ST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1dbece0a-39a3-4932-8781-a679dd699587",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_model(peptides, kinases, model=model, tokenizer=tokenizer, device='cuda', batch_size=50, output_hidden_states=True, output_attentions=True):\n",
    "    \"\"\"Score (peptide, kinase) pairs with the model and yield one result dict per pair.\n",
    "\n",
    "    Pairs are built with ``zip(peptides, kinases)``, tokenized together in batches\n",
    "    of at most ``batch_size`` pairs and passed through ``model`` under\n",
    "    ``torch.no_grad()``. This is a generator: nothing runs until it is iterated.\n",
    "\n",
    "    Args:\n",
    "        peptides: iterable of peptide sequences (first element of every pair).\n",
    "        kinases: iterable of kinase sequences (second element of every pair).\n",
    "        model: model to evaluate; it is switched to eval mode and moved to ``device``.\n",
    "        tokenizer: callable used to encode a list of (peptide, kinase) pairs.\n",
    "        device: torch device for the model and the encoded inputs.\n",
    "        batch_size: maximum number of pairs per forward pass; must be >= 1.\n",
    "        output_hidden_states: when true, each result gets an 'embedding' entry taken\n",
    "            from the last hidden state at positions where the attention mask is set.\n",
    "        output_attentions: when true, each result gets an 'attention' entry taken\n",
    "            from the last attention map, restricted to the same positions.\n",
    "\n",
    "    Yields:\n",
    "        dict with the keys 'peptide', 'kinase', 'probability' (softmax score of\n",
    "        label 1), 'classifier_attn_outputs' and 'classifier_attn_output_weights',\n",
    "        plus 'embedding' / 'attention' when requested.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if ``batch_size`` is smaller than 1.\n",
    "    \"\"\"\n",
    "    if batch_size < 1:\n",
    "        raise ValueError('batch_size must be at least 1')\n",
    "\n",
    "    torch.cuda.empty_cache()\n",
    "\n",
    "    model.eval()\n",
    "    model = model.to(device)\n",
    "\n",
    "    def _predict_batch(batch):\n",
    "        # Run one forward pass over `batch` and yield a result dict per pair.\n",
    "        output = dict(zip(('peptide','kinase'),zip(*batch)))\n",
    "        ids = tokenizer(batch, padding=True, return_tensors='pt')\n",
    "        ids = ids.to(device)\n",
    "\n",
    "        with torch.no_grad():\n",
    "            results, classifier_attn_outputs, classifier_attn_output_weights = model(ids['input_ids'],\n",
    "                            attention_mask=ids['attention_mask'],\n",
    "                            output_hidden_states=output_hidden_states,\n",
    "                            output_attentions=output_attentions)\n",
    "\n",
    "            # boolean copy of the tokenizer's attention mask, used to select positions below\n",
    "            attention_mask = ids['attention_mask'].cpu().type(torch.bool)\n",
    "\n",
    "            output['probability'] = results['logits'].softmax(1)[:,1].cpu().numpy()\n",
    "\n",
    "            if output_hidden_states:\n",
    "                last_embeddings = results['hidden_states'][-1].cpu().numpy()\n",
    "                output['embedding'] = [i[m] for i, m in zip(last_embeddings, attention_mask)]\n",
    "\n",
    "            if output_attentions:\n",
    "                last_attentions = results['attentions'][-1].cpu().numpy()\n",
    "                output['attention'] = [i[:,m,:][:,:,m] for i, m in zip(last_attentions, attention_mask)]\n",
    "\n",
    "            output['classifier_attn_outputs'] = classifier_attn_outputs.cpu()\n",
    "\n",
    "            classifier_attn_output_weights = classifier_attn_output_weights.cpu()\n",
    "            # NOTE(review): the offset 16 presumably skips the start token plus a\n",
    "            # 15-residue peptide so only kinase positions remain -- confirm against the model code.\n",
    "            output['classifier_attn_output_weights'] = [i[:,m[16:]] for i, m in zip(classifier_attn_output_weights, attention_mask)]\n",
    "\n",
    "        keys = output.keys()\n",
    "        for data in zip(*(output[k] for k in keys)):\n",
    "            yield dict(zip(keys, data))\n",
    "\n",
    "    pairs = []\n",
    "    for n, pair in enumerate(zip(peptides, kinases)):\n",
    "        sys.stderr.write(f'{1+n}\\r')  # running count of pairs as a progress indicator\n",
    "        pairs.append(pair)\n",
    "        if len(pairs) == batch_size:\n",
    "            yield from _predict_batch(pairs)\n",
    "            pairs = []\n",
    "\n",
    "    # Flush the trailing partial batch. Counting the pairs actually produced by zip()\n",
    "    # (rather than len(peptides)) means no pair is dropped when the inputs differ in length.\n",
    "    if pairs:\n",
    "        yield from _predict_batch(pairs)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3fbfc05d-970c-4db9-bae8-61ea2ffb06af",
   "metadata": {},
   "source": [
    "## helper function to use Phos-ST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "98e90bc6-28db-449c-8a0b-805ef22cd9ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accepts either one kinase domain / substrate pair (two strings) or two\n",
    "# equal-length lists of sequences.\n",
    "def phosST(kinaseDomainSeq,substrate15mer, device=None):\n",
    "    \"\"\"Run the Phos-ST model on kinase/substrate pairs and collect the scores.\n",
    "\n",
    "    Args:\n",
    "        kinaseDomainSeq: a kinase domain sequence, or a list of them.\n",
    "        substrate15mer: a substrate peptide sequence, or a list of them; entry k\n",
    "            is paired with entry k of ``kinaseDomainSeq``.\n",
    "        device: torch device passed to ``run_model``. Defaults to 'cuda' when\n",
    "            CUDA is available and to 'cpu' otherwise.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with the columns 'kinase', 'peptide' and 'prob', one\n",
    "        row per pair. The score of every pair is also printed.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the two inputs do not contain the same number of sequences.\n",
    "    \"\"\"\n",
    "    if device is None:\n",
    "        # fall back to the CPU so the example also runs without a GPU\n",
    "        device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
    "\n",
    "    # a bare string is treated as a single sequence, not as a list of characters\n",
    "    if isinstance(kinaseDomainSeq, str):\n",
    "        kinaseDomainSeq = [kinaseDomainSeq]\n",
    "    if isinstance(substrate15mer, str):\n",
    "        substrate15mer = [substrate15mer]\n",
    "    if len(kinaseDomainSeq) != len(substrate15mer):\n",
    "        raise ValueError('kinaseDomainSeq and substrate15mer must contain the same number of sequences')\n",
    "\n",
    "    job = run_model(\n",
    "        substrate15mer,\n",
    "        kinaseDomainSeq,\n",
    "        model=model,\n",
    "        tokenizer=tokenizer,\n",
    "        device=device,\n",
    "        batch_size=10,\n",
    "        output_hidden_states=False,\n",
    "        output_attentions=False,\n",
    "    )\n",
    "\n",
    "    results = {\n",
    "        'kinase' : [],\n",
    "        'peptide' : [],\n",
    "        'prob' : [],\n",
    "    }\n",
    "\n",
    "    for prediction in job:\n",
    "        results['kinase' ].append(prediction['kinase'])\n",
    "        results['peptide'].append(prediction['peptide'])\n",
    "        results['prob'   ].append(prediction['probability'])\n",
    "        # printed per pair, so nothing is read from the loop variable after the loop ends\n",
    "        print(\"The Predictive score is \"+str(prediction['probability']))\n",
    "\n",
    "    result = pd.DataFrame(results)\n",
    "\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "151c217b-1ee1-4cf7-b41f-b52f5ce22719",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee511f8b-5d9e-4c8a-8191-bd2a7fd3a5e9",
   "metadata": {},
   "source": [
    "# Positive Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5bd6e8ee-444e-49d5-a617-d2343759759a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Kinase domain sequence for P17612 (KAPCA_HUMAN)\n",
    "kinDomain=\"FERIKTLGTGSFGRVMLVKHKETGNHYAMKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMVMEYVPGGEMFSHLRRIGRFSEPHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGYIQVTDFGFAKRVKGRTWTLCGTPEYLAPEIILSKGYNKAVDWWALGVLIYEMAAGYPPFFADQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKRFGNLKNGVNDIKNHKWF\"\n",
    "# Substrate peptide P53602_S96_LARKRRNSRDGDPLP\n",
    "substrate=\"LARKRRNSRDGDPLP\"\n",
    "\n",
    "# phosST prints the score and returns a DataFrame, which is written to CSV here\n",
    "phosST(kinDomain,substrate).to_csv('PostiveExample.csv')\n",
    "# the score is also listed in the CSV file (column 'prob')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1f27f0f-5bda-4107-adef-a8712ace540c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "f432840c-f56e-40f2-959f-157dc65f57d6",
   "metadata": {},
   "source": [
    "# Negative Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41e2c0de-9088-4cf1-a744-a451ce19d7a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Kinase domain sequence for P17612 (KAPCA_HUMAN)\n",
    "kinDomain=\"FERIKTLGTGSFGRVMLVKHKETGNHYAMKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMVMEYVPGGEMFSHLRRIGRFSEPHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGYIQVTDFGFAKRVKGRTWTLCGTPEYLAPEIILSKGYNKAVDWWALGVLIYEMAAGYPPFFADQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKRFGNLKNGVNDIKNHKWF\"\n",
    "# Substrate peptide Q01831_T169_PVEIEIETPEQAKTR\n",
    "substrate=\"PVEIEIETPEQAKTR\"\n",
    "\n",
    "# phosST prints the score and returns a DataFrame, which is written to CSV here\n",
    "phosST(kinDomain,substrate).to_csv('NegitiveExample.csv')\n",
    "# the score is also listed in the CSV file (column 'prob')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac3b5b10-3cde-4f66-ba7a-f137538fa880",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85509eea-3217-492f-bf77-9da8ee123b76",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bfe47df-7e6b-487b-92b6-33ba2d9c6eb7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6c2b239-f7d1-418b-bd1d-916fb1db8933",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5af09564-6b23-4dea-a0a3-76bc8362b7b4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}