import numpy as np import scipy import scipy.spatial import string import os,re import random import util import gzip to1letter = { "ALA":'A', "ARG":'R', "ASN":'N', "ASP":'D', "CYS":'C', "GLN":'Q', "GLU":'E', "GLY":'G', "HIS":'H', "ILE":'I', "LEU":'L', "LYS":'K', "MET":'M', "PHE":'F', "PRO":'P', "SER":'S', "THR":'T', "TRP":'W', "TYR":'Y', "VAL":'V' } # read A3M and convert letters into # integers in the 0..20 range, # also keep track of insertions def parse_a3m(filename): msa = [] ins = [] table = str.maketrans(dict.fromkeys(string.ascii_lowercase)) #print(filename) if filename.split('.')[-1] == 'gz': fp = gzip.open(filename, 'rt') else: fp = open(filename, 'r') # read file line by line for line in fp: # skip labels if line[0] == '>': continue # remove right whitespaces line = line.rstrip() if len(line) == 0: continue # remove lowercase letters and append to MSA msa.append(line.translate(table)) # sequence length L = len(msa[-1]) # 0 - match or gap; 1 - insertion a = np.array([0 if c.isupper() or c=='-' else 1 for c in line]) i = np.zeros((L)) if np.sum(a) > 0: # positions of insertions pos = np.where(a==1)[0] # shift by occurrence a = pos - np.arange(pos.shape[0]) # position of insertions in cleaned sequence # and their length pos,num = np.unique(a, return_counts=True) # append to the matrix of insetions i[pos] = num ins.append(i) if len(msa) == 10000: break # convert letters into numbers alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='|S1').view(np.uint8) msa = np.array([list(s) for s in msa], dtype='|S1').view(np.uint8) for i in range(alphabet.shape[0]): msa[msa == alphabet[i]] = i # treat all unknown characters as gaps msa[msa > 20] = 20 ins = np.array(ins, dtype=np.uint8) return msa,ins # read and extract xyz coords of N,Ca,C atoms # from a PDB file def parse_pdb(filename): lines = open(filename,'r').readlines() return parse_pdb_lines(lines) #''' def parse_pdb_lines(lines): # indices of residues observed in the structure idx_s = [int(l[22:26]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"] # 4 BB + up to 10 SC atoms xyz = np.full((len(idx_s), 14, 3), np.nan, dtype=np.float32) for l in lines: if l[:4] != "ATOM": continue resNo, atom, aa = int(l[22:26]), l[12:16], l[17:20] idx = idx_s.index(resNo) for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]): if tgtatm == atom: xyz[idx,i_atm,:] = [float(l[30:38]), float(l[38:46]), float(l[46:54])] break # save atom mask mask = np.logical_not(np.isnan(xyz[...,0])) xyz[np.isnan(xyz[...,0])] = 0.0 return xyz,mask,np.array(idx_s) #''' ''' def parse_pdb_lines(lines): # indices of residues observed in the structure #idx_s = [int(l[22:26]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"] res = [(l[22:26],l[17:20]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"] idx_s = [int(r[0]) for r in res] seq = [util.aa2num[r[1]] if r[1] in util.aa2num.keys() else 20 for r in res] # 4 BB + up to 10 SC atoms xyz = np.full((len(idx_s), 14, 3), np.nan, dtype=np.float32) for l in lines: if l[:4] != "ATOM": continue resNo, atom, aa = int(l[22:26]), l[12:16], l[17:20] idx = idx_s.index(resNo) for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]): if tgtatm == atom: xyz[idx,i_atm,:] = [float(l[30:38]), float(l[38:46]), float(l[46:54])] break # save atom mask mask = np.logical_not(np.isnan(xyz[...,0])) xyz[np.isnan(xyz[...,0])] = 0.0 return xyz,mask,np.array(idx_s), np.array(seq) ''' def parse_templates(item, params): # init FFindexDB of templates ### and extract template IDs ### present in the DB ffdb = FFindexDB(read_index(params['FFDB']+'_pdb.ffindex'), read_data(params['FFDB']+'_pdb.ffdata')) #ffids = set([i.name for i in ffdb.index]) # process tabulated hhsearch output to get # matched positions and positional scores infile = params['DIR']+'/hhr/'+item[-2:]+'/'+item+'.atab' hits = [] for l in open(infile, "r").readlines(): if l[0]=='>': key = l[1:].split()[0] hits.append([key,[],[]]) elif "score" in l or "dssp" in l: continue else: hi = l.split()[:5]+[0.0,0.0,0.0] hits[-1][1].append([int(hi[0]),int(hi[1])]) hits[-1][2].append([float(hi[2]),float(hi[3]),float(hi[4])]) # get per-hit statistics from an .hhr file # (!!! assume that .hhr and .atab have the same hits !!!) # [Probab, E-value, Score, Aligned_cols, # Identities, Similarity, Sum_probs, Template_Neff] lines = open(infile[:-4]+'hhr', "r").readlines() pos = [i+1 for i,l in enumerate(lines) if l[0]=='>'] for i,posi in enumerate(pos): hits[i].append([float(s) for s in re.sub('[=%]',' ',lines[posi]).split()[1::2]]) # parse templates from FFDB for hi in hits: #if hi[0] not in ffids: # continue entry = get_entry_by_name(hi[0], ffdb.index) if entry == None: continue data = read_entry_lines(entry, ffdb.data) hi += list(parse_pdb_lines(data)) # process hits counter = 0 xyz,qmap,mask,f0d,f1d,ids = [],[],[],[],[],[] for data in hits: if len(data)<7: continue qi,ti = np.array(data[1]).T _,sel1,sel2 = np.intersect1d(ti, data[6], return_indices=True) ncol = sel1.shape[0] if ncol < 10: continue ids.append(data[0]) f0d.append(data[3]) f1d.append(np.array(data[2])[sel1]) xyz.append(data[4][sel2]) mask.append(data[5][sel2]) qmap.append(np.stack([qi[sel1]-1,[counter]*ncol],axis=-1)) counter += 1 xyz = np.vstack(xyz).astype(np.float32) mask = np.vstack(mask).astype(np.bool) qmap = np.vstack(qmap).astype(np.long) f0d = np.vstack(f0d).astype(np.float32) f1d = np.vstack(f1d).astype(np.float32) ids = ids return xyz,mask,qmap,f0d,f1d,ids