Spaces:

merle
/

PROTEIN_GENERATOR

Running on T4

PROTEIN_GENERATOR / utils /model /parsers.py

Jacob Gershon

new b

59a9ccf over 1 year ago

6.61 kB

	import numpy as np
	import scipy
	import scipy.spatial
	import string
	import os,re
	import random
	import util
	import gzip

	to1letter = {
	"ALA":'A', "ARG":'R', "ASN":'N', "ASP":'D', "CYS":'C',
	"GLN":'Q', "GLU":'E', "GLY":'G', "HIS":'H', "ILE":'I',
	"LEU":'L', "LYS":'K', "MET":'M', "PHE":'F', "PRO":'P',
	"SER":'S', "THR":'T', "TRP":'W', "TYR":'Y', "VAL":'V' }

	# read A3M and convert letters into
	# integers in the 0..20 range,
	# also keep track of insertions
	def parse_a3m(filename):

	msa = []
	ins = []

	table = str.maketrans(dict.fromkeys(string.ascii_lowercase))

	#print(filename)

	if filename.split('.')[-1] == 'gz':
	fp = gzip.open(filename, 'rt')
	else:
	fp = open(filename, 'r')

	# read file line by line
	for line in fp:

	# skip labels
	if line[0] == '>':
	continue

	# remove right whitespaces
	line = line.rstrip()

	if len(line) == 0:
	continue

	# remove lowercase letters and append to MSA
	msa.append(line.translate(table))

	# sequence length
	L = len(msa[-1])

	# 0 - match or gap; 1 - insertion
	a = np.array([0 if c.isupper() or c=='-' else 1 for c in line])
	i = np.zeros((L))

	if np.sum(a) > 0:
	# positions of insertions
	pos = np.where(a==1)[0]

	# shift by occurrence
	a = pos - np.arange(pos.shape[0])

	# position of insertions in cleaned sequence
	# and their length
	pos,num = np.unique(a, return_counts=True)

	# append to the matrix of insetions
	i[pos] = num

	ins.append(i)
	if len(msa) == 10000:
	break

	# convert letters into numbers
	alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='\|S1').view(np.uint8)
	msa = np.array([list(s) for s in msa], dtype='\|S1').view(np.uint8)
	for i in range(alphabet.shape[0]):
	msa[msa == alphabet[i]] = i

	# treat all unknown characters as gaps
	msa[msa > 20] = 20

	ins = np.array(ins, dtype=np.uint8)

	return msa,ins


	# read and extract xyz coords of N,Ca,C atoms
	# from a PDB file
	def parse_pdb(filename):
	lines = open(filename,'r').readlines()
	return parse_pdb_lines(lines)

	#'''
	def parse_pdb_lines(lines):

	# indices of residues observed in the structure
	idx_s = [int(l[22:26]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"]

	# 4 BB + up to 10 SC atoms
	xyz = np.full((len(idx_s), 14, 3), np.nan, dtype=np.float32)
	for l in lines:
	if l[:4] != "ATOM":
	continue
	resNo, atom, aa = int(l[22:26]), l[12:16], l[17:20]
	idx = idx_s.index(resNo)
	for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]):
	if tgtatm == atom:
	xyz[idx,i_atm,:] = [float(l[30:38]), float(l[38:46]), float(l[46:54])]
	break

	# save atom mask
	mask = np.logical_not(np.isnan(xyz[...,0]))
	xyz[np.isnan(xyz[...,0])] = 0.0

	return xyz,mask,np.array(idx_s)
	#'''

	'''
	def parse_pdb_lines(lines):

	# indices of residues observed in the structure
	#idx_s = [int(l[22:26]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"]
	res = [(l[22:26],l[17:20]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"]
	idx_s = [int(r[0]) for r in res]
	seq = [util.aa2num[r[1]] if r[1] in util.aa2num.keys() else 20 for r in res]

	# 4 BB + up to 10 SC atoms
	xyz = np.full((len(idx_s), 14, 3), np.nan, dtype=np.float32)
	for l in lines:
	if l[:4] != "ATOM":
	continue
	resNo, atom, aa = int(l[22:26]), l[12:16], l[17:20]
	idx = idx_s.index(resNo)
	for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]):
	if tgtatm == atom:
	xyz[idx,i_atm,:] = [float(l[30:38]), float(l[38:46]), float(l[46:54])]
	break

	# save atom mask
	mask = np.logical_not(np.isnan(xyz[...,0]))
	xyz[np.isnan(xyz[...,0])] = 0.0

	return xyz,mask,np.array(idx_s), np.array(seq)
	'''


	def parse_templates(item, params):

	# init FFindexDB of templates
	### and extract template IDs
	### present in the DB
	ffdb = FFindexDB(read_index(params['FFDB']+'_pdb.ffindex'),
	read_data(params['FFDB']+'_pdb.ffdata'))
	#ffids = set([i.name for i in ffdb.index])

	# process tabulated hhsearch output to get
	# matched positions and positional scores
	infile = params['DIR']+'/hhr/'+item[-2:]+'/'+item+'.atab'
	hits = []
	for l in open(infile, "r").readlines():
	if l[0]=='>':
	key = l[1:].split()[0]
	hits.append([key,[],[]])
	elif "score" in l or "dssp" in l:
	continue
	else:
	hi = l.split()[:5]+[0.0,0.0,0.0]
	hits[-1][1].append([int(hi[0]),int(hi[1])])
	hits[-1][2].append([float(hi[2]),float(hi[3]),float(hi[4])])

	# get per-hit statistics from an .hhr file
	# (!!! assume that .hhr and .atab have the same hits !!!)
	# [Probab, E-value, Score, Aligned_cols,
	# Identities, Similarity, Sum_probs, Template_Neff]
	lines = open(infile[:-4]+'hhr', "r").readlines()
	pos = [i+1 for i,l in enumerate(lines) if l[0]=='>']
	for i,posi in enumerate(pos):
	hits[i].append([float(s) for s in re.sub('[=%]',' ',lines[posi]).split()[1::2]])

	# parse templates from FFDB
	for hi in hits:
	#if hi[0] not in ffids:
	# continue
	entry = get_entry_by_name(hi[0], ffdb.index)
	if entry == None:
	continue
	data = read_entry_lines(entry, ffdb.data)
	hi += list(parse_pdb_lines(data))

	# process hits
	counter = 0
	xyz,qmap,mask,f0d,f1d,ids = [],[],[],[],[],[]
	for data in hits:
	if len(data)<7:
	continue

	qi,ti = np.array(data[1]).T
	_,sel1,sel2 = np.intersect1d(ti, data[6], return_indices=True)
	ncol = sel1.shape[0]
	if ncol < 10:
	continue

	ids.append(data[0])
	f0d.append(data[3])
	f1d.append(np.array(data[2])[sel1])
	xyz.append(data[4][sel2])
	mask.append(data[5][sel2])
	qmap.append(np.stack([qi[sel1]-1,[counter]*ncol],axis=-1))
	counter += 1

	xyz = np.vstack(xyz).astype(np.float32)
	mask = np.vstack(mask).astype(np.bool)
	qmap = np.vstack(qmap).astype(np.long)
	f0d = np.vstack(f0d).astype(np.float32)
	f1d = np.vstack(f1d).astype(np.float32)
	ids = ids

	return xyz,mask,qmap,f0d,f1d,ids