# armheb's picture
# update representations \t
# 5f3e2ac
from calendar import c
from transformers import AlbertForMaskedLM, AlbertTokenizer, pipeline
import numpy as np
import gradio as gr
# Load the ProtAlbert tokenizer and masked-LM weights from the
# "Rostlab/prot_albert" checkpoint. The tokenizer is created with
# do_lower_case=False.
tokenizer = AlbertTokenizer.from_pretrained(
    "Rostlab/prot_albert",
    do_lower_case=False,
)
model = AlbertForMaskedLM.from_pretrained("Rostlab/prot_albert")

# Fill-mask pipeline shared by the prediction functions below; top_k=21 asks
# for 21 candidates per masked position.
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=21,
)
# Module-level tables shared by the profile functions.

# One-letter residue codes in profile-row order; 'X' is the last entry.
codes = [
    'V', 'L', 'I', 'M', 'F', 'W', 'Y', 'G', 'A', 'P',
    'S', 'T', 'C', 'H', 'R', 'K', 'Q', 'E', 'N', 'D',
    'X',
]

# Column names: three leading fields, the 20 residue codes (every code
# except the trailing 'X'), then eight trailing fields.
header = (
    ['SeqNo', 'PDB', 'No']
    + codes[:-1]
    + ['NOCC', 'NDEL', 'NINS', 'ENTROPY', 'RELENT', 'WEIGHT', 'CHAIN', 'AUTHCHAIN']
)

# Default list of sequence positions to leave out of a profile (none).
rem = []

# Residue code -> row index in the profile matrix (same order as `codes`).
Hash1 = {residue: row for row, residue in enumerate(codes)}
def ReadfastaFile(filename):
    """Read a FASTA-style text file and return its sequence as one string.

    Every line that contains ">" is treated as a header and skipped. The
    remaining lines have trailing whitespace removed and are concatenated in
    file order, so a file holding several records yields a single merged
    string. The result is upper-cased.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The concatenated, upper-cased sequence text ("" for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # `with` guarantees the handle is closed even if reading fails part-way.
    with open(filename, "r") as fasta:
        chunks = [line.rstrip() for line in fasta if ">" not in line]
    return "".join(chunks).upper()
def Predict_profile(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict a per-position residue profile, stored as truncated percentages.

    Each position of `sequence` is replaced in turn by '[MASK]', the
    space-separated residues are passed to the module-level `fill_mask`
    pipeline, and every returned candidate whose token is a key of `Hash1`
    has ``int(score * 100)`` written into the profile matrix. Candidates
    whose token is not in `Hash1` are only printed together with the
    position index. The position index is printed after each prediction as
    a progress trace.

    Args:
        sequence: Residue string; each character is one position.
        header: Unused; kept so existing callers that pass it still work.
        rem: Position indices to skip. Skipped positions are not predicted
            and their columns are removed from the result.
        Hash1: Mapping from residue token to row index of the matrix.

    Returns:
        A float ndarray of shape ``(len(Hash1), len(sequence))`` with the
        columns listed in `rem` deleted.
    """
    residues = list(sequence)
    skipped = set(rem)  # O(1) membership test instead of a list scan per position
    pred_Profile = np.zeros((len(Hash1), len(residues)))

    for i in range(len(residues)):
        if i in skipped:
            continue
        masked = residues.copy()
        masked[i] = '[MASK]'
        for candidate in fill_mask(" ".join(masked)):
            # Drop the "▁" marker the tokenizer may attach to the token text.
            token = candidate['token_str'].replace("▁", "")
            if token not in Hash1:
                print(i, token)
            else:
                pred_Profile[Hash1[token]][i] = int(candidate['score'] * 100)
        print(i)

    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
def Predict_profile1(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict a per-position residue profile holding raw pipeline scores.

    Each position of `sequence` is replaced in turn by '[MASK]', the
    space-separated residues are passed to the module-level `fill_mask`
    pipeline, and every returned candidate's score is added to the profile
    matrix: to the row ``Hash1[token]`` when the token is a key of `Hash1`,
    otherwise to the ``Hash1['X']`` row. The position index is printed after
    each prediction as a progress trace.

    Args:
        sequence: Residue string; each character is one position.
        header: Unused; kept so existing callers that pass it still work.
        rem: Position indices to skip. Skipped positions are not predicted
            and their columns are removed from the result.
        Hash1: Mapping from residue token to row index of the matrix. It
            must contain 'X' for out-of-alphabet candidates to be recorded.

    Returns:
        A float ndarray of shape ``(len(Hash1), len(sequence))`` with the
        columns listed in `rem` deleted.

    Raises:
        KeyError: If a candidate token is not in `Hash1` and `Hash1` has no
            'X' entry.
    """
    residues = list(sequence)
    skipped = set(rem)  # O(1) membership test instead of a list scan per position
    pred_Profile = np.zeros((len(Hash1), len(residues)))

    for i in range(len(residues)):
        if i in skipped:
            continue
        masked = residues.copy()
        masked[i] = '[MASK]'
        for candidate in fill_mask(" ".join(masked)):
            # Drop the "▁" marker the tokenizer may attach to the token text.
            token = candidate['token_str'].replace("▁", "")
            # Out-of-alphabet tokens are pooled into the 'X' row. The row must
            # be addressed by its integer index: an ndarray cannot be indexed
            # with the string 'X'.
            row = Hash1[token] if token in Hash1 else Hash1['X']
            # Accumulate rather than assign, so the pooled 'X' mass does not
            # depend on the order in which candidates are returned. Each cell
            # starts at zero, so a token seen once gets exactly its score.
            pred_Profile[row, i] += candidate['score']
        print(i)

    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
def print_func(sequence):
    """Format the predicted profile of `sequence` as plain text.

    The input is cleaned first: all whitespace is removed and letters are
    upper-cased, so a pasted sequence with line breaks or lowercase residues
    is handled the same way `ReadfastaFile` output would be. The cleaned
    sequence is passed to `Predict_profile1`, and each row of the result
    becomes one output line: the residue code from `codes`, a space, then the
    row's values in NumPy's array text format with the brackets removed.

    Args:
        sequence: Residue string typed by the user (None is treated as "").

    Returns:
        One line per profile row, joined with newlines.
    """
    import sys  # local import: only needed for the "no limit" print settings

    cleaned = "".join((sequence or "").split()).upper()
    profile = Predict_profile1(cleaned)

    lines = []
    for code, row in zip(codes, profile):
        # str(ndarray) wraps at the print line width and elides long arrays
        # with "...", which broke rows apart for longer sequences. Lifting
        # both limits keeps every row complete and on a single line.
        body = np.array2string(
            row,
            max_line_width=sys.maxsize,
            threshold=sys.maxsize,
        )
        lines.append(code + " " + body.replace('[', '').replace(']', ''))
    return "\n".join(lines)
# Gradio front end: one text input, the formatted profile as text output.
title = "Protein sequence profile prediction using ProtAlbert transformer"
description = (
    "Please enter the sequence.\n"
    "* Prediction process can take longer for long sequences.\n"
)

iface = gr.Interface(
    fn=print_func,
    inputs=["text"],
    outputs="text",
    title=title,
    description=description,
)

# Start the web app as soon as this file is executed.
iface.launch()