# Phosformer-ST Example Code

## imports

In [None]:
import os
import sys
import hashlib
import warnings
sys.dont_write_bytecode=True

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch

from tokenization_esm import EsmTokenizer
from modeling_esm import EsmForSequenceClassificationMHACustom
# For version specifics, see the README


## loading in pre-trained model

In [2]:
# Location handed to both `from_pretrained` calls below.
# NOTE(review): presumably a local directory (relative to the notebook's working
# directory) that holds the pre-trained Phosformer-ST checkpoint — confirm.
model_dir = 'multitask_MHA_esm2_t30_150M_UR50D_neg_ratio_8+8_shift_30_mask_0.2_2023-03-25_90'

# Load the tokenizer and the classification model (configured with two labels) from
# the same location. Both names are used as default arguments of `run_model` and
# are passed explicitly by `phosST`.
tokenizer = EsmTokenizer.from_pretrained(model_dir)
model = EsmForSequenceClassificationMHACustom.from_pretrained(model_dir, num_labels=2)



## Configuring parameters of the Phos-ST model

## Also organizing the data for input into Phos-ST

In [3]:
def run_model(peptides, kinases, model=model, tokenizer=tokenizer, device='cuda', batch_size=50, output_hidden_states=True, output_attentions=True):
    """Run the model over peptide/kinase pairs in batches, yielding one dict per pair.

    This is a generator: nothing happens (including argument validation and
    moving the model to ``device``) until it is iterated.

    Parameters
    ----------
    peptides, kinases : iterable of str
        Paired element-wise with ``zip``. Any iterable works (no ``len`` is
        required). If the two differ in length, the extra items of the longer
        one are ignored, but every pair that ``zip`` produces is scored.
    model, tokenizer
        Default to the notebook-level ``model`` and ``tokenizer`` objects that
        exist when this function is defined.
    device : str or torch.device
        Device the model and the tokenized batch are moved to. If a CUDA device
        is requested but CUDA is unavailable, a warning is issued and the CPU
        is used instead.
    batch_size : int
        Maximum number of pairs tokenized and scored per forward pass.
    output_hidden_states, output_attentions : bool
        Forwarded to the model; they also control whether the ``'embedding'``
        and ``'attention'`` entries are present in the yielded dicts.

    Yields
    ------
    dict
        Keys, in order: ``'peptide'``, ``'kinase'``, ``'probability'``
        (softmax over the logits, column 1), optionally ``'embedding'`` and
        ``'attention'`` (last layer, padding positions removed using the
        attention mask), ``'classifier_attn_outputs'`` and
        ``'classifier_attn_output_weights'``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    # Degrade gracefully on machines without a GPU instead of failing in `.to(device)`.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        warnings.warn('CUDA is not available; running the model on the CPU instead.')
        device = 'cpu'

    torch.cuda.empty_cache()

    model.eval()
    model = model.to(device)

    def _batches():
        # Group consecutive (peptide, kinase) pairs into lists of at most `batch_size`.
        # The trailing partial batch is always emitted, so no pair is lost even
        # when the two inputs have different lengths.
        batch = []
        for n, pair in enumerate(zip(peptides, kinases)):
            sys.stderr.write(f'{1+n}\r')  # lightweight progress counter
            batch.append(pair)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    for pairs in _batches():
        batch_peptides, batch_kinases = zip(*pairs)
        output = {'peptide': batch_peptides, 'kinase': batch_kinases}

        # Each pair is tokenized as a two-sequence input; padding makes the batch rectangular.
        ids = tokenizer(pairs, padding=True, return_tensors='pt')
        ids = ids.to(device)

        with torch.no_grad():
            # The custom model returns its regular outputs plus the classifier's
            # attention outputs and attention weights.
            results, classifier_attn_outputs, classifier_attn_output_weights = model(
                ids['input_ids'],
                attention_mask=ids['attention_mask'],
                output_hidden_states=output_hidden_states,
                output_attentions=output_attentions,
            )

        # Boolean mask of non-padding positions, used to trim per-sample arrays.
        attention_mask = ids['attention_mask'].cpu().type(torch.bool)

        # NOTE(review): column 1 is presumably the "phosphorylated" class — confirm.
        output['probability'] = results['logits'].softmax(1)[:,1].cpu().numpy()

        if output_hidden_states:
            last_embeddings = results['hidden_states'][-1].cpu().numpy()
            output['embedding'] = [i[m] for i, m in zip(last_embeddings, attention_mask)]

        if output_attentions:
            last_attentions = results['attentions'][-1].cpu().numpy()
            output['attention'] = [i[:,m,:][:,:,m] for i, m in zip(last_attentions, attention_mask)]

        classifier_attn_outputs = classifier_attn_outputs.cpu()
        output['classifier_attn_outputs'] = classifier_attn_outputs

        classifier_attn_output_weights = classifier_attn_output_weights.cpu()
        # NOTE(review): the hard-coded 16 presumably skips the leading special token
        # plus the 15-residue peptide so only the kinase positions remain — confirm
        # against the model's classifier attention layout.
        output['classifier_attn_output_weights'] = [i[:,m[16:]] for i, m in zip(classifier_attn_output_weights, attention_mask)]

        # Transpose the per-key batch columns into one dict per pair.
        keys = output.keys()
        for data in zip(*(output[k] for k in keys)):
            yield dict(zip(keys, data))


## Helper function to use Phos-ST

In [4]:
# Accepts either a single kinase domain / substrate 15-mer (plain strings) or
# lists of them; a single string is paired with every element of the other list.
def phosST(kinaseDomainSeq, substrate15mer, device='cuda', batch_size=10):
    """Score kinase-domain / substrate pairs with Phos-ST and return a DataFrame.

    Parameters
    ----------
    kinaseDomainSeq : str or sequence of str
        Kinase domain sequence(s).
    substrate15mer : str or sequence of str
        Substrate peptide(s).
    device : str
        Forwarded to ``run_model``.
    batch_size : int
        Forwarded to ``run_model``.

    If one argument is a single string and the other a list, the string is
    repeated to match the list. Two lists must have the same length.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``'kinase'``, ``'peptide'`` and ``'prob'``.
        The score of every pair is also printed.

    Raises
    ------
    ValueError
        If two lists of different lengths are given.
    """
    kinase_is_single = isinstance(kinaseDomainSeq, str)
    substrate_is_single = isinstance(substrate15mer, str)

    if kinase_is_single and substrate_is_single:
        kinase_seqs, substrates = [kinaseDomainSeq], [substrate15mer]
    elif kinase_is_single:
        substrates = list(substrate15mer)
        kinase_seqs = [kinaseDomainSeq] * len(substrates)
    elif substrate_is_single:
        kinase_seqs = list(kinaseDomainSeq)
        substrates = [substrate15mer] * len(kinase_seqs)
    else:
        kinase_seqs, substrates = list(kinaseDomainSeq), list(substrate15mer)
        if len(kinase_seqs) != len(substrates):
            raise ValueError(
                f'got {len(kinase_seqs)} kinase sequences but {len(substrates)} substrates; '
                'the two lists must have the same length'
            )

    job = run_model(
        substrates,
        kinase_seqs,
        model=model,
        tokenizer=tokenizer,
        device=device,
        batch_size=batch_size,
        # Only the probability is needed here, so skip the heavier outputs.
        output_hidden_states=False,
        output_attentions=False,
    )

    results = {
        'kinase'  : [],
        'peptide' : [],
        'prob'    : [],
    }

    for row in job:
        results['kinase' ].append(row['kinase'])
        results['peptide'].append(row['peptide'])
        results['prob'   ].append(row['probability'])
        # Printed per row so it never depends on a loop variable leaking out of
        # the loop (which would fail on empty input and hide all but the last score).
        print("The Predictive score is "+str(row['probability']))

    return pd.DataFrame(results)
 

# Positive Example

In [None]:
# Kinase domain sequence; identifier: P17612 KAPCA_HUMAN
kinDomain="FERIKTLGTGSFGRVMLVKHKETGNHYAMKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMVMEYVPGGEMFSHLRRIGRFSEPHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGYIQVTDFGFAKRVKGRTWTLCGTPEYLAPEIILSKGYNKAVDWWALGVLIYEMAAGYPPFFADQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKRFGNLKNGVNDIKNHKWF"
# Substrate 15-mer; identifier: P53602_S96_LARKRRNSRDGDPLP
substrate="LARKRRNSRDGDPLP"

# Score the pair (phosST prints the score) and save the one-row result table as CSV.
phosST(kinDomain,substrate).to_csv('PostiveExample.csv')
# The score is also written to the CSV file, in the 'prob' column.

# Negative Example

In [None]:
# Kinase domain sequence; identifier: P17612 KAPCA_HUMAN (same kinase as the positive example)
kinDomain="FERIKTLGTGSFGRVMLVKHKETGNHYAMKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMVMEYVPGGEMFSHLRRIGRFSEPHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGYIQVTDFGFAKRVKGRTWTLCGTPEYLAPEIILSKGYNKAVDWWALGVLIYEMAAGYPPFFADQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKRFGNLKNGVNDIKNHKWF"
# Substrate 15-mer; identifier: Q01831_T169_PVEIEIETPEQAKTR
substrate="PVEIEIETPEQAKTR"

# Score the pair (phosST prints the score) and save the one-row result table as CSV.
phosST(kinDomain,substrate).to_csv('NegitiveExample.csv')
# The score is also written to the CSV file, in the 'prob' column.