Spaces:

armheb
/

Albert_Protein_profile

Runtime error

App Files Files Community

Albert_Protein_profile / app.py

armheb

update representations

5f3e2ac about 2 years ago

raw history blame contribute delete

No virus

4.22 kB

	from calendar import c
	from transformers import AlbertForMaskedLM, AlbertTokenizer, pipeline
	import numpy as np
	import gradio as gr


	## Load the model
	# Tokenizer and masked-LM weights both come from the "Rostlab/prot_albert"
	# checkpoint; the tokenizer is told not to lower-case its input.
	tokenizer = AlbertTokenizer.from_pretrained(
	    "Rostlab/prot_albert",
	    do_lower_case=False,
	)
	model = AlbertForMaskedLM.from_pretrained("Rostlab/prot_albert")

	# Fill-mask pipeline configured with top_k=21 candidates per masked position.
	fill_mask = pipeline(
	    "fill-mask",
	    model=model,
	    tokenizer=tokenizer,
	    top_k=21,
	)

	## Initialization
	# Residue alphabet: 20 amino-acid letters followed by 'X'. The position of a
	# letter in this list is its row index in a predicted profile.
	codes = list("VLIMFWYGAPSTCHRKQENDX")

	# Column titles of the profile table; entries 3..22 are the 20 amino acids.
	header = (
	    ["SeqNo", "PDB", "No"]
	    + codes[:20]
	    + ["NOCC", "NDEL", "NINS", "ENTROPY", "RELENT", "WEIGHT", "CHAIN", "AUTHCHAIN"]
	)

	# Default list of sequence positions to skip during prediction (none).
	rem = []

	# Residue letter -> row index, in the same order as `codes`.
	Hash1 = {residue: row for row, residue in enumerate(codes)}



	def ReadfastaFile(filename):
	    """Read a FASTA file and return its residues as one upper-case string.

	    Every line containing ">" is treated as a header and skipped. The
	    remaining lines are stripped of trailing whitespace and concatenated in
	    file order, so all records in the file are merged into one sequence.

	    Args:
	        filename: Path of the FASTA file to read.

	    Returns:
	        The concatenated sequence text, converted to upper case.

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    # The context manager closes the file even if reading fails midway.
	    with open(filename, "r") as handle:
	        stripped = (line.rstrip() for line in handle)
	        # Join once instead of growing a string line by line.
	        sequence = "".join(line for line in stripped if ">" not in line)
	    return sequence.upper()


	def Predict_profile(sequence, header=header, rem=rem, Hash1=Hash1):
	    """Predict a percentage residue profile by masking each position in turn.

	    For every index of ``sequence`` that is not listed in ``rem``, the
	    residue at that index is replaced by ``[MASK]`` and the space-separated
	    sequence is passed to the module-level ``fill_mask`` pipeline. Each
	    returned candidate whose token (with the "▁" marker removed) is a key of
	    ``Hash1`` has its score stored as ``int(score * 100)``; any other
	    candidate is only printed together with the position.

	    Args:
	        sequence: Residue string, one character per position.
	        header: Not used by the computation; kept so existing calls that
	            pass it remain valid.
	        rem: Position indices to skip. Their columns are deleted from the
	            returned array.
	        Hash1: Mapping from residue letter to row index of the profile.

	    Returns:
	        A ``numpy.ndarray`` with ``len(Hash1)`` rows and one column per
	        position of ``sequence``, minus the columns listed in ``rem``.
	    """
	    profile = np.zeros((len(Hash1), len(sequence)))
	    residues = list(sequence)
	    for pos in range(len(residues)):
	        if pos in rem:
	            continue
	        # Mask a copy so the other positions keep their residues.
	        masked = residues.copy()
	        masked[pos] = '[MASK]'
	        for candidate in fill_mask(" ".join(masked)):
	            token = candidate['token_str'].replace("▁", "")
	            if token in Hash1:
	                profile[Hash1[token]][pos] = int(candidate['score'] * 100)
	            else:
	                # Report candidates that are not part of the alphabet.
	                print(pos, token)
	        # NOTE(review): progress output; assumed to be emitted only for
	        # positions that were actually predicted -- confirm.
	        print(pos)
	    # len() rather than truthiness so an array-like `rem` also works.
	    if len(rem) != 0:
	        profile = np.delete(profile, rem, 1)
	    return profile

	def Predict_profile1(sequence, header=header, rem=rem, Hash1=Hash1):
	    """Predict a residue score profile by masking each position in turn.

	    For every index of ``sequence`` that is not listed in ``rem``, the
	    residue at that index is replaced by ``[MASK]`` and the space-separated
	    sequence is passed to the module-level ``fill_mask`` pipeline. The score
	    of each returned candidate is added to the row of its token (with the
	    "▁" marker removed); candidates whose token is not a key of ``Hash1``
	    are added to the ``'X'`` row instead.

	    Args:
	        sequence: Residue string, one character per position.
	        header: Not used by the computation; kept so existing calls that
	            pass it remain valid.
	        rem: Position indices to skip. Their columns are deleted from the
	            returned array.
	        Hash1: Mapping from residue letter to row index of the profile.
	            Must contain the key ``'X'``.

	    Returns:
	        A ``numpy.ndarray`` with ``len(Hash1)`` rows and one column per
	        position of ``sequence``, minus the columns listed in ``rem``.

	    Raises:
	        KeyError: If ``Hash1`` has no ``'X'`` entry.
	    """
	    profile = np.zeros((len(Hash1), len(sequence)))
	    # Row index for out-of-alphabet tokens. The array has to be indexed by
	    # this integer; indexing it with the string 'X' raises IndexError.
	    unknown_row = Hash1['X']
	    residues = list(sequence)
	    for pos in range(len(residues)):
	        if pos in rem:
	            continue
	        # Mask a copy so the other positions keep their residues.
	        masked = residues.copy()
	        masked[pos] = '[MASK]'
	        for candidate in fill_mask(" ".join(masked)):
	            token = candidate['token_str'].replace("▁", "")
	            row = Hash1.get(token, unknown_row)
	            # Accumulate rather than assign, so a literal 'X' candidate
	            # cannot overwrite mass already collected in the 'X' row.
	            profile[row][pos] += candidate['score']
	        # NOTE(review): progress output; assumed to be emitted only for
	        # positions that were actually predicted -- confirm.
	        print(pos)
	    # len() rather than truthiness so an array-like `rem` also works.
	    if len(rem) != 0:
	        profile = np.delete(profile, rem, 1)
	    return profile

	def print_func(sequence):
	    """Format the predicted profile of ``sequence`` as plain text.

	    Calls ``Predict_profile1`` and renders one output line per profile row:
	    the residue letter taken from ``codes``, a space, and that row's scores
	    in NumPy's default number formatting with the brackets removed.

	    Args:
	        sequence: Residue string passed on to ``Predict_profile1``.

	    Returns:
	        The formatted rows joined by newlines.
	    """
	    import sys  # local import: only needed to lift NumPy's print limits

	    profile = Predict_profile1(sequence)
	    lines = []
	    for code, row in zip(codes, profile):
	        # str(row) wraps at 75 characters and elides long rows with "...";
	        # lifting both limits keeps every score on the row's single line.
	        scores = np.array2string(
	            row,
	            max_line_width=sys.maxsize,
	            threshold=sys.maxsize,
	        )
	        lines.append(str(code) + " " + scores.replace('[', '').replace(']', ''))
	    return "\n".join(lines)


	# Texts displayed on the Gradio page.
	title = "Protein sequence profile prediction using ProtAlbert transformer"
	description = """Please enter the sequence.
	* Prediction process can take longer for long sequences.
	"""

	# One text input, one text output; print_func produces the output text.
	iface = gr.Interface(
	    fn=print_func,
	    inputs=["text"],
	    outputs="text",
	    title=title,
	    description=description,
	)
	iface.launch()