Spaces:
Running
Running
import argparse | |
def main(args):
    """Convert every ``*.pdb`` file in a folder into one JSON-lines file.

    Each PDB file is read once. ``ATOM`` records (plus ``HETATM`` records of
    selenomethionine, ``MSE``, which are treated as ``MET``) are grouped by
    their chain ID, and the backbone atoms N, CA, C and O are extracted.
    One JSON object per file is written, with these keys in this order:

    * ``seq_chain_<id>``: one-letter sequence of the chain. ``-`` marks
      residue numbers that are absent between the lowest and highest number
      seen, and residue names outside the 20 standard amino acids.
    * ``coords_chain_<id>``: dict holding ``N_chain_<id>``, ``CA_chain_<id>``,
      ``C_chain_<id>`` and ``O_chain_<id>``; each is a list with one
      ``[x, y, z]`` triple per sequence position. Missing atoms are NaN,
      which ``json.dumps`` emits as the (non-standard JSON) token ``NaN``.
    * ``name``: the file name without directory and extension.
    * ``num_of_chains``: number of chains written for the file.
    * ``seq``: all chain sequences concatenated in output order.

    Chains are emitted in the order upper-case letters, lower-case letters,
    digits; chain IDs outside that set are ignored. Files are processed in
    sorted path order so the output is reproducible.

    Args:
        args: object with the attributes ``input_path`` (folder containing
            the ``*.pdb`` files) and ``output_path`` (file to write).

    Raises:
        ValueError: if a coordinate or residue-number field of a parsed
            record cannot be converted to a number.
    """
    import glob
    import json
    import os

    import numpy as np

    alpha_1 = "ARNDCQEGHILKMFPSTWYV-"
    alpha_3 = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
               'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', 'GAP']
    gap_index = len(alpha_1) - 1
    aa_3_N = {name: idx for idx, name in enumerate(alpha_3)}
    backbone_atoms = ['N', 'CA', 'C', 'O']
    missing_xyz = [float('nan')] * 3

    # Labels tried as chain IDs, in output order. The chain column is a
    # single character, so of the numeric labels only '0'-'9' can match.
    chain_alphabet = (
        list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
        + [str(i) for i in range(300)]
    )
    known_chains = set(chain_alphabet)

    def read_chains(path):
        """Return {chain: {resn: {insertion_code: (resname, {atom: xyz})}}}."""
        chains = {}
        with open(path, "rb") as handle:
            for raw in handle:
                line = raw.decode("utf-8", "ignore").rstrip()
                resname = line[17:20]
                if line[:6] == "HETATM" and resname == "MSE":
                    # Selenomethionine is handled as methionine. Only the
                    # residue name is swapped; the record text is left
                    # alone so the fixed-width columns stay aligned.
                    resname = "MET"
                elif line[:4] != "ATOM":
                    continue
                chain_id = line[21:22]
                if chain_id not in known_chains:
                    continue
                atom = line[12:16].strip()
                number = line[22:27].strip()
                xyz = [float(line[i:i + 8]) for i in (30, 38, 46)]
                # A trailing letter is an insertion code (e.g. "52A").
                if number[-1:].isalpha():
                    resa, resn = number[-1], int(number[:-1]) - 1
                else:
                    resa, resn = "", int(number) - 1
                residues = chains.setdefault(chain_id, {})
                # setdefault keeps the first residue name / first position
                # seen when a residue or atom is listed more than once.
                entry = residues.setdefault(resn, {}).setdefault(resa, (resname, {}))
                entry[1].setdefault(atom, xyz)
        return chains

    def assemble_chain(residues, atoms):
        """Return (coords of shape (length, len(atoms), 3), sequence string)."""
        seq_idx, coords = [], []
        for resn in range(min(residues), max(residues) + 1):
            variants = residues.get(resn)
            if variants is None:
                # Residue number missing from the file: gap + NaN atoms.
                seq_idx.append(gap_index)
                coords.extend(missing_xyz for _ in atoms)
                continue
            for resa in sorted(variants):
                resname, atom_xyz = variants[resa]
                seq_idx.append(aa_3_N.get(resname, gap_index))
                coords.extend(atom_xyz.get(atom, missing_xyz) for atom in atoms)
        xyz = np.array(coords).reshape(-1, len(atoms), 3)
        return xyz, "".join(alpha_1[i] for i in seq_idx)

    pdb_dict_list = []
    pattern = os.path.join(args.input_path, '*.pdb')
    for biounit in sorted(glob.glob(pattern)):
        chains = read_chains(biounit)
        my_dict = {}
        seq_parts = []
        for letter in chain_alphabet:
            residues = chains.get(letter)
            if residues is None:
                continue
            xyz, seq = assemble_chain(residues, backbone_atoms)
            seq_parts.append(seq)
            my_dict['seq_chain_' + letter] = seq
            my_dict['coords_chain_' + letter] = {
                atom + '_chain_' + letter: xyz[:, idx, :].tolist()
                for idx, atom in enumerate(backbone_atoms)
            }
        my_dict['name'] = os.path.splitext(os.path.basename(biounit))[0]
        my_dict['num_of_chains'] = len(seq_parts)
        my_dict['seq'] = "".join(seq_parts)
        pdb_dict_list.append(my_dict)

    with open(args.output_path, 'w') as f:
        for entry in pdb_dict_list:
            f.write(json.dumps(entry) + '\n')
if __name__ == "__main__":
    # Command-line entry point: read the two paths and hand them to main().
    # Both options are required so that a missing one is reported as a usage
    # error by argparse instead of reaching main() as None.
    argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    argparser.add_argument("--input_path", type=str, required=True,
                           help="Path to a folder with pdb files, e.g. /home/my_pdbs/")
    argparser.add_argument("--output_path", type=str, required=True,
                           help="Path where to save .jsonl dictionary of parsed pdbs")
    args = argparser.parse_args()
    main(args)