{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b1b4e507-0ca2-4309-ac5c-0461f99edc72",
   "metadata": {},
   "source": [
    "# Phosformer-ST Example Code\n",
    "\n",
    "Loads the pre-trained Phosformer-ST model and scores a kinase domain / substrate 15-mer pair, with one positive and one negative example."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3a23dd26-2060-4cb1-a1a0-dd97b168a329",
   "metadata": {},
   "source": [
    "## imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec3bd89c-8aa1-408c-b569-c89dc2bb768d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import hashlib\n",
    "import warnings\n",
    "sys.dont_write_bytecode=True\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import torch\n",
    "\n",
    "from tokenization_esm import EsmTokenizer\n",
    "from modeling_esm import EsmForSequenceClassificationMHACustom\n",
    "# for versioning specifics see the README\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a42e87fc-bd23-4b7b-8234-06cbdcb25bc0",
   "metadata": {},
   "source": [
    "## loading in pre-trained model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9def3d4b-822d-44b8-a896-3e6ee5aca13d",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_dir = 'multitask_MHA_esm2_t30_150M_UR50D_neg_ratio_8+8_shift_30_mask_0.2_2023-03-25_90'\n",
    "\n",
    "tokenizer = EsmTokenizer.from_pretrained(model_dir)\n",
    "model = EsmForSequenceClassificationMHACustom.from_pretrained(model_dir, num_labels=2)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff3f7f18-6cb2-4818-9583-bb729e848b81",
   "metadata": {},
   "source": [
    "## configuring parameters of the Phos-ST model\n",
    "\n",
    "## also organizing the data for input into Phos-ST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1dbece0a-39a3-4932-8781-a679dd699587",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _predict_batch(pairs, model, tokenizer, device, output_hidden_states, output_attentions):\n",
    "    \"\"\"Run the model on one batch of (peptide, kinase) pairs.\n",
    "\n",
    "    Yields one dict per pair, in input order, with the keys 'peptide', 'kinase',\n",
    "    'probability', 'classifier_attn_outputs' and 'classifier_attn_output_weights',\n",
    "    plus 'embedding' / 'attention' when the corresponding flag is set.\n",
    "    \"\"\"\n",
    "    output = dict(zip(('peptide','kinase'),zip(*pairs)))\n",
    "    ids = tokenizer(pairs, padding=True, return_tensors='pt')\n",
    "    ids = ids.to(device)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        results, classifier_attn_outputs, classifier_attn_output_weights = model(\n",
    "            ids['input_ids'],\n",
    "            attention_mask=ids['attention_mask'],\n",
    "            output_hidden_states=output_hidden_states,\n",
    "            output_attentions=output_attentions,\n",
    "        )\n",
    "\n",
    "    # Boolean copy of the tokenizer's attention mask, used below to select\n",
    "    # positions per sample (the batch was tokenized with padding=True).\n",
    "    attention_mask = ids['attention_mask'].cpu().type(torch.bool)\n",
    "\n",
    "    # Softmax over the logits; the value at class index 1 is reported as the score.\n",
    "    output['probability'] = results['logits'].softmax(1)[:,1].cpu().numpy()\n",
    "\n",
    "    if output_hidden_states:\n",
    "        last_embeddings = results['hidden_states'][-1].cpu().numpy()\n",
    "        output['embedding'] = [i[m] for i, m in zip(last_embeddings, attention_mask)]\n",
    "\n",
    "    if output_attentions:\n",
    "        last_attentions = results['attentions'][-1].cpu().numpy()\n",
    "        output['attention'] = [i[:,m,:][:,:,m] for i, m in zip(last_attentions, attention_mask)]\n",
    "\n",
    "    classifier_attn_outputs = classifier_attn_outputs.cpu()\n",
    "    output['classifier_attn_outputs'] = classifier_attn_outputs\n",
    "\n",
    "    classifier_attn_output_weights = classifier_attn_output_weights.cpu()\n",
    "    # NOTE(review): the mask is sliced from position 16 on before it indexes the\n",
    "    # classifier attention weights; presumably this skips the <cls> token plus the\n",
    "    # 15-residue peptide so only kinase positions remain - confirm against the model code.\n",
    "    output['classifier_attn_output_weights'] = [i[:,m[16:]] for i, m in zip(classifier_attn_output_weights, attention_mask)]\n",
    "\n",
    "    keys = output.keys()\n",
    "    for data in zip(*(output[k] for k in keys)):\n",
    "        yield dict(zip(keys, data))\n",
    "\n",
    "\n",
    "def run_model(peptides, kinases, model=model, tokenizer=tokenizer, device='cuda', batch_size=50, output_hidden_states=True, output_attentions=True):\n",
    "    \"\"\"Score (peptide, kinase) pairs with Phosformer-ST, lazily and in batches.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    peptides, kinases : iterable of str\n",
    "        Paired element-wise with zip, so iteration stops at the shorter one.\n",
    "    model, tokenizer\n",
    "        Default to the objects loaded in the cell above.\n",
    "    device : str or torch.device\n",
    "        Device to run on. If CUDA is requested but not available, a warning is\n",
    "        issued and the CPU is used instead.\n",
    "    batch_size : int\n",
    "        Number of pairs sent through the model at once.\n",
    "    output_hidden_states, output_attentions : bool\n",
    "        Forwarded to the model; when set, each result also carries an\n",
    "        'embedding' / 'attention' entry.\n",
    "\n",
    "    Yields\n",
    "    ------\n",
    "    dict\n",
    "        One result per input pair, in input order.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If batch_size is smaller than 1 (raised when iteration starts, because\n",
    "        this function is a generator).\n",
    "    \"\"\"\n",
    "    if batch_size < 1:\n",
    "        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')\n",
    "\n",
    "    cuda_available = torch.cuda.is_available()\n",
    "    if cuda_available:\n",
    "        torch.cuda.empty_cache()\n",
    "\n",
    "    device = torch.device(device)\n",
    "    if device.type == 'cuda' and not cuda_available:\n",
    "        # Keep the notebook runnable on machines without a GPU.\n",
    "        warnings.warn('CUDA is not available; running Phosformer-ST on the CPU instead.')\n",
    "        device = torch.device('cpu')\n",
    "\n",
    "    model.eval()\n",
    "    model = model.to(device)\n",
    "\n",
    "    pairs = []\n",
    "    for n, pair in enumerate(zip(peptides, kinases)):\n",
    "        sys.stderr.write(f'{1+n}\\r')\n",
    "        pairs.append(pair)\n",
    "        if len(pairs) == batch_size:\n",
    "            yield from _predict_batch(pairs, model, tokenizer, device, output_hidden_states, output_attentions)\n",
    "            pairs = []\n",
    "\n",
    "    # Flush the final, possibly partial batch. Batching on the pairs actually\n",
    "    # seen (rather than on len(peptides)) means no result is dropped when the\n",
    "    # two inputs differ in length, and plain iterators are accepted too.\n",
    "    if pairs:\n",
    "        yield from _predict_batch(pairs, model, tokenizer, device, output_hidden_states, output_attentions)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3fbfc05d-970c-4db9-bae8-61ea2ffb06af",
   "metadata": {},
   "source": [
    "## helper function to use Phos-ST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "98e90bc6-28db-449c-8a0b-805ef22cd9ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# this could be modified to take in a list of substrates and kinase domains:\n",
    "# just drop the square brackets around the 1st and 2nd arguments of the\n",
    "# run_model call below and pass the lists in directly\n",
    "def phosST(kinaseDomainSeq,substrate15mer):\n",
    "    \"\"\"Score one kinase domain / substrate pair with Phosformer-ST.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    kinaseDomainSeq : str\n",
    "        Kinase domain sequence.\n",
    "    substrate15mer : str\n",
    "        Substrate peptide sequence.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns 'kinase', 'peptide' and 'prob', one row per scored pair.\n",
    "        The score is also printed.\n",
    "    \"\"\"\n",
    "    job = run_model(\n",
    "        [substrate15mer],\n",
    "        [kinaseDomainSeq],\n",
    "        model=model,\n",
    "        tokenizer=tokenizer,\n",
    "        device='cuda',\n",
    "        batch_size=10,\n",
    "        output_hidden_states=False,\n",
    "        output_attentions=False,\n",
    "    )\n",
    "\n",
    "    results = {\n",
    "        'kinase'  : [],\n",
    "        'peptide' : [],\n",
    "        'prob'    : [],\n",
    "    }\n",
    "\n",
    "    for i in job:\n",
    "        results['kinase' ].append(i['kinase'])\n",
    "        results['peptide'].append(i['peptide'])\n",
    "        results['prob'   ].append(i['probability'])\n",
    "\n",
    "    result = pd.DataFrame(results)\n",
    "\n",
    "    # Read the score from the collected results rather than from the loop\n",
    "    # variable, which would be undefined if the model yielded nothing.\n",
    "    if results['prob']:\n",
    "        print(\"The Predictive score is \"+str(results['prob'][-1]))\n",
    "\n",
    "    return result\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee511f8b-5d9e-4c8a-8191-bd2a7fd3a5e9",
   "metadata": {},
   "source": [
    "# Positive Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5bd6e8ee-444e-49d5-a617-d2343759759a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# P17612 KAPCA_HUMAN\n",
    "kinDomain=\"FERIKTLGTGSFGRVMLVKHKETGNHYAMKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMVMEYVPGGEMFSHLRRIGRFSEPHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGYIQVTDFGFAKRVKGRTWTLCGTPEYLAPEIILSKGYNKAVDWWALGVLIYEMAAGYPPFFADQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKRFGNLKNGVNDIKNHKWF\"\n",
    "# P53602_S96_LARKRRNSRDGDPLP\n",
    "substrate=\"LARKRRNSRDGDPLP\"\n",
    "\n",
    "phosST(kinDomain,substrate).to_csv('PostiveExample.csv')\n",
    "# the score is also written to the csv file"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f432840c-f56e-40f2-959f-157dc65f57d6",
   "metadata": {},
   "source": [
    "# Negative Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41e2c0de-9088-4cf1-a744-a451ce19d7a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# P17612 KAPCA_HUMAN\n",
    "kinDomain=\"FERIKTLGTGSFGRVMLVKHKETGNHYAMKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMVMEYVPGGEMFSHLRRIGRFSEPHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGYIQVTDFGFAKRVKGRTWTLCGTPEYLAPEIILSKGYNKAVDWWALGVLIYEMAAGYPPFFADQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKRFGNLKNGVNDIKNHKWF\"\n",
    "# Q01831_T169_PVEIEIETPEQAKTR\n",
    "substrate=\"PVEIEIETPEQAKTR\"\n",
    "\n",
    "phosST(kinDomain,substrate).to_csv('NegitiveExample.csv')\n",
    "# the score is also written to the csv file"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}