"""Gradio demo: per-residue amino-acid profile prediction with ProtAlbert.

Every position of the input sequence is masked in turn and the
``Rostlab/prot_albert`` masked-language model is asked to fill it in.  The
scores returned for each candidate residue form one column of the profile.
"""

from calendar import c  # noqa: F401  (unused in this file; retained as-is)

import gradio as gr
import numpy as np
from transformers import AlbertForMaskedLM, AlbertTokenizer, pipeline

## Load the model
tokenizer = AlbertTokenizer.from_pretrained("Rostlab/prot_albert", do_lower_case=False)
model = AlbertForMaskedLM.from_pretrained("Rostlab/prot_albert")

# Fill-mask pipeline; top_k=21 asks for 21 candidates per masked position.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer, top_k=21)

## Initialization
header = [
    'SeqNo', 'PDB', 'No',
    'V', 'L', 'I', 'M', 'F', 'W', 'Y', 'G', 'A', 'P',
    'S', 'T', 'C', 'H', 'R', 'K', 'Q', 'E', 'N', 'D',
    'NOCC', 'NDEL', 'NINS', 'ENTROPY', 'RELENT', 'WEIGHT', 'CHAIN', 'AUTHCHAIN',
]
# Sequence positions (0-based) to skip and drop from the profile; empty by default.
rem = []
# Row order of the profile matrix: the 20 residue codes followed by 'X'.
codes = [
    'V', 'L', 'I', 'M', 'F', 'W', 'Y', 'G', 'A', 'P',
    'S', 'T', 'C', 'H', 'R', 'K', 'Q', 'E', 'N', 'D', 'X',
]
# Residue code -> profile row index.  Derived from ``codes`` so the two can
# never drift apart ('V' -> 0, 'L' -> 1, ..., 'D' -> 19, 'X' -> 20).
Hash1 = {code: row for row, code in enumerate(codes)}

_MASK_TOKEN = '[MASK]'
# Marker character stripped from the pipeline's ``token_str`` values.
_TOKEN_PREFIX = "▁"


def ReadfastaFile(filename):
    """Read a FASTA file and return its sequence data as one upper-case string.

    Every line containing ``>`` is treated as a header and skipped.  All
    remaining lines are right-stripped and concatenated, so a file holding
    several records yields their sequences joined together.

    Args:
        filename: Path of the file to read.

    Returns:
        The concatenated, upper-cased sequence text.
    """
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(filename, "r") as handle:
        chunks = [line.rstrip() for line in handle if ">" not in line]
    return "".join(chunks).upper()


def _mask_predictions(sequence, position):
    """Return ``(token, score)`` pairs the model proposes for one position.

    The sequence is fed to the pipeline as space-separated residues with the
    residue at ``position`` replaced by the mask token.
    """
    residues = list(sequence)
    residues[position] = _MASK_TOKEN
    candidates = fill_mask(" ".join(residues))
    return [
        (candidate['token_str'].replace(_TOKEN_PREFIX, ""), candidate['score'])
        for candidate in candidates
    ]


def Predict_profile(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict an integer-percentage profile for ``sequence``.

    Each cell holds ``int(score * 100)`` for the corresponding residue code
    at that position.  Predicted tokens that are not keys of ``Hash1`` are
    printed together with their position and otherwise ignored.

    Args:
        sequence: Residue string; one model call is made per position.
        header: Unused; kept so existing callers keep working.
        rem: 0-based positions to skip; their columns are removed from the
            result.
        Hash1: Mapping from token to row index of the profile.

    Returns:
        ``numpy.ndarray`` with ``len(Hash1)`` rows and one column per
        position of ``sequence`` that is not listed in ``rem``.
    """
    skipped = set(rem)  # O(1) membership test inside the loop
    pred_Profile = np.zeros((len(Hash1), len(sequence)))
    for position in range(len(sequence)):
        if position in skipped:
            continue
        for token, score in _mask_predictions(sequence, position):
            if token not in Hash1:
                print(position, token)
            else:
                pred_Profile[Hash1[token], position] = int(score * 100)
        print(position)  # progress output
    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile


def Predict_profile1(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict a raw-score profile for ``sequence``.

    Like ``Predict_profile`` but stores the model scores unscaled, and adds
    the score of every token that is not a key of ``Hash1`` to the ``'X'``
    row instead of discarding it.

    Args:
        sequence: Residue string; one model call is made per position.
        header: Unused; kept so existing callers keep working.
        rem: 0-based positions to skip; their columns are removed from the
            result.
        Hash1: Mapping from token to row index; must contain the key ``'X'``.

    Returns:
        ``numpy.ndarray`` with ``len(Hash1)`` rows and one column per
        position of ``sequence`` that is not listed in ``rem``.

    Raises:
        KeyError: If ``Hash1`` has no ``'X'`` entry.
    """
    skipped = set(rem)  # O(1) membership test inside the loop
    # Row index of the catch-all bucket.  The array must be indexed with this
    # integer; indexing it with the string 'X' raises IndexError.
    unknown_row = Hash1['X']
    pred_Profile = np.zeros((len(Hash1), len(sequence)))
    for position in range(len(sequence)):
        if position in skipped:
            continue
        for token, score in _mask_predictions(sequence, position):
            # Accumulate rather than assign so that a literal 'X' prediction
            # and unknown tokens folded into 'X' add up instead of
            # overwriting each other.
            pred_Profile[Hash1.get(token, unknown_row), position] += score
        print(position)  # progress output
    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile


def print_func(sequence):
    """Render the predicted profile of ``sequence`` as plain text.

    Args:
        sequence: Sequence text as typed by the user.

    Returns:
        One line per entry of ``codes``: the residue code followed by the
        space-separated scores for every sequence position.
    """
    # Drop whitespace/newlines so they are not treated as sequence positions,
    # and upper-case as ReadfastaFile does (the tokenizer is loaded with
    # do_lower_case=False, so nothing downstream normalises case).
    cleaned = "".join((sequence or "").split()).upper()
    profile = Predict_profile1(cleaned)
    # Format every value explicitly: str(ndarray) wraps long rows and
    # abbreviates large arrays with "...", which corrupts the table.
    lines = [
        " ".join([str(code)] + [format(value, ".6g") for value in row])
        for code, row in zip(codes, profile)
    ]
    return "\n".join(lines)


title = "Protein sequence profile prediction using ProtAlbert transformer"
description = """Please enter the sequence. * Prediction process can take longer for long sequences. """

iface = gr.Interface(
    fn=print_func,
    inputs=["text"],
    outputs="text",
    description=description,
    title=title,
)
iface.launch()