import argparse def main(args): from dateutil import parser import numpy as np import os, time, gzip, json import glob folder_with_pdbs_path = args.input_path save_path = args.output_path alpha_1 = list("ARNDCQEGHILKMFPSTWYV-") states = len(alpha_1) alpha_3 = ['ALA','ARG','ASN','ASP','CYS','GLN','GLU','GLY','HIS','ILE', 'LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL','GAP'] aa_1_N = {a:n for n,a in enumerate(alpha_1)} aa_3_N = {a:n for n,a in enumerate(alpha_3)} aa_N_1 = {n:a for n,a in enumerate(alpha_1)} aa_1_3 = {a:b for a,b in zip(alpha_1,alpha_3)} aa_3_1 = {b:a for a,b in zip(alpha_1,alpha_3)} def AA_to_N(x): # ["ARND"] -> [[0,1,2,3]] x = np.array(x); if x.ndim == 0: x = x[None] return [[aa_1_N.get(a, states-1) for a in y] for y in x] def N_to_AA(x): # [[0,1,2,3]] -> ["ARND"] x = np.array(x); if x.ndim == 1: x = x[None] return ["".join([aa_N_1.get(a,"-") for a in y]) for y in x] def parse_PDB_biounits(x, atoms=['N','CA','C'], chain=None): ''' input: x = PDB filename atoms = atoms to extract (optional) output: (length, atoms, coords=(x,y,z)), sequence ''' xyz,seq,min_resn,max_resn = {},{},1e6,-1e6 for line in open(x,"rb"): line = line.decode("utf-8","ignore").rstrip() if line[:6] == "HETATM" and line[17:17+3] == "MSE": line = line.replace("HETATM","ATOM ") line = line.replace("MSE","MET") if line[:4] == "ATOM": ch = line[21:22] if ch == chain or chain is None: atom = line[12:12+4].strip() resi = line[17:17+3] resn = line[22:22+5].strip() x,y,z = [float(line[i:(i+8)]) for i in [30,38,46]] if resn[-1].isalpha(): resa,resn = resn[-1],int(resn[:-1])-1 else: resa,resn = "",int(resn)-1 # resn = int(resn) if resn < min_resn: min_resn = resn if resn > max_resn: max_resn = resn if resn not in xyz: xyz[resn] = {} if resa not in xyz[resn]: xyz[resn][resa] = {} if resn not in seq: seq[resn] = {} if resa not in seq[resn]: seq[resn][resa] = resi if atom not in xyz[resn][resa]: xyz[resn][resa][atom] = np.array([x,y,z]) # convert to numpy arrays, fill in missing values seq_,xyz_ = [],[] try: for resn in range(min_resn,max_resn+1): if resn in seq: for k in sorted(seq[resn]): seq_.append(aa_3_N.get(seq[resn][k],20)) else: seq_.append(20) if resn in xyz: for k in sorted(xyz[resn]): for atom in atoms: if atom in xyz[resn][k]: xyz_.append(xyz[resn][k][atom]) else: xyz_.append(np.full(3,np.nan)) else: for atom in atoms: xyz_.append(np.full(3,np.nan)) return np.array(xyz_).reshape(-1,len(atoms),3), N_to_AA(np.array(seq_)) except TypeError: return 'no_chain', 'no_chain' pdb_dict_list = [] c = 0 if folder_with_pdbs_path[-1]!='/': folder_with_pdbs_path = folder_with_pdbs_path+'/' init_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G','H', 'I', 'J','K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T','U', 'V','W','X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g','h', 'i', 'j','k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't','u', 'v','w','x', 'y', 'z'] extra_alphabet = [str(item) for item in list(np.arange(300))] chain_alphabet = init_alphabet + extra_alphabet biounit_names = glob.glob(folder_with_pdbs_path+'*.pdb') for biounit in biounit_names: my_dict = {} s = 0 concat_seq = '' concat_N = [] concat_CA = [] concat_C = [] concat_O = [] concat_mask = [] coords_dict = {} for letter in chain_alphabet: xyz, seq = parse_PDB_biounits(biounit, atoms=['N','CA','C','O'], chain=letter) if type(xyz) != str: concat_seq += seq[0] my_dict['seq_chain_'+letter]=seq[0] coords_dict_chain = {} coords_dict_chain['N_chain_'+letter]=xyz[:,0,:].tolist() coords_dict_chain['CA_chain_'+letter]=xyz[:,1,:].tolist() coords_dict_chain['C_chain_'+letter]=xyz[:,2,:].tolist() coords_dict_chain['O_chain_'+letter]=xyz[:,3,:].tolist() my_dict['coords_chain_'+letter]=coords_dict_chain s += 1 fi = biounit.rfind("/") my_dict['name']=biounit[(fi+1):-4] my_dict['num_of_chains'] = s my_dict['seq'] = concat_seq if s < len(chain_alphabet): pdb_dict_list.append(my_dict) c+=1 with open(save_path, 'w') as f: for entry in pdb_dict_list: f.write(json.dumps(entry) + '\n') if __name__ == "__main__": argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) argparser.add_argument("--input_path", type=str, help="Path to a folder with pdb files, e.g. /home/my_pdbs/") argparser.add_argument("--output_path", type=str, help="Path where to save .jsonl dictionary of parsed pdbs") args = argparser.parse_args() main(args)