# Source: Hugging Face Spaces - simonduerr/ProteinMPNN (running on T4)
# File size: 5,652 bytes
# Revision: e1a6cd9

import argparse

def main(args):
    """Convert every ``*.pdb`` file in a folder into one JSON-lines file.

    For each PDB file the backbone atoms N, CA, C and O of every chain are
    extracted, and one JSON object per file is written to the output file
    (one object per line) with these keys, in this order:

    * ``seq_chain_<ID>``    - one-letter sequence of chain ``<ID>``
    * ``coords_chain_<ID>`` - dict with the keys ``N_chain_<ID>``,
      ``CA_chain_<ID>``, ``C_chain_<ID>`` and ``O_chain_<ID>``; each value is
      a list with one ``[x, y, z]`` triple per residue
    * ``name``              - file name without the ``.pdb`` extension
    * ``num_of_chains``     - number of chains that were found
    * ``seq``               - the sequences of all chains concatenated

    Chains are emitted in the order upper-case letters, lower-case letters,
    digits; records whose chain ID is none of these are ignored.  Residue
    numbers missing inside a chain become ``-`` in the sequence, and missing
    atoms get NaN coordinates (written by ``json`` as the token ``NaN``).
    Files are processed in sorted path order so the output is reproducible.

    Args:
        args: object with the attributes ``input_path`` (folder containing
            the PDB files, with or without a trailing slash) and
            ``output_path`` (file to write; overwritten if it exists).

    Raises:
        ValueError: if a selected ATOM record has a non-numeric coordinate or
            residue number.
        IndexError: if a selected ATOM record has a blank residue number.
    """
    import glob
    import json
    import os

    import numpy as np

    backbone_atoms = ['N', 'CA', 'C', 'O']

    alpha_1 = "ARNDCQEGHILKMFPSTWYV-"
    alpha_3 = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
               'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', 'GAP']
    aa_3_N = {name: index for index, name in enumerate(alpha_3)}
    # Index of the '-' symbol; also used for residue names not in alpha_3.
    gap_index = len(alpha_1) - 1

    # Chain IDs to look for, in output order.  The chain field is a single
    # column, so of the numeric entries only '0'-'9' can ever match.
    chain_alphabet = (list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
                      + list("abcdefghijklmnopqrstuvwxyz")
                      + [str(number) for number in range(300)])
    known_chains = set(chain_alphabet)

    def read_atom_lines(pdb_path):
        """Read *pdb_path* once and return ``{chain_id: [ATOM lines]}``.

        HETATM records of residue MSE are rewritten as MET ATOM records.
        Only records whose chain ID is in ``known_chains`` are kept, so
        records of other chains are never parsed further.
        """
        by_chain = {}
        with open(pdb_path, "rb") as handle:
            for raw in handle:
                line = raw.decode("utf-8", "ignore").rstrip()
                if line[:6] == "HETATM" and line[17:20] == "MSE":
                    line = line.replace("HETATM", "ATOM  ").replace("MSE", "MET")
                if line[:4] == "ATOM":
                    chain_id = line[21:22]
                    if chain_id in known_chains:
                        by_chain.setdefault(chain_id, []).append(line)
        return by_chain

    def parse_chain(lines, atoms):
        """Turn the ATOM lines of one chain into coordinates and a sequence.

        Args:
            lines: non-empty list of fixed-column ATOM record lines.
            atoms: atom names to extract, in output order.

        Returns:
            ``(xyz, seq)`` where ``xyz`` is an array of shape
            ``(length, len(atoms), 3)`` and ``seq`` is the one-letter
            sequence string of the same length.
        """
        names = {}   # residue number -> insertion code -> 3-letter name
        coords = {}  # residue number -> insertion code -> atom name -> xyz
        for line in lines:
            atom = line[12:16].strip()
            res_name = line[17:20]
            res_id = line[22:27].strip()
            position = np.array([float(line[i:i + 8]) for i in (30, 38, 46)])

            # A trailing letter is an insertion code ("52A"); numbers are
            # shifted to be zero-based.
            if res_id[-1].isalpha():
                icode, number = res_id[-1], int(res_id[:-1]) - 1
            else:
                icode, number = "", int(res_id) - 1

            # setdefault keeps the first record seen for a residue/atom, so
            # later duplicates (e.g. alternate locations) are ignored.
            names.setdefault(number, {}).setdefault(icode, res_name)
            coords.setdefault(number, {}).setdefault(icode, {}).setdefault(atom, position)

        missing = np.full(3, np.nan)
        seq_indices, xyz_rows = [], []
        # Walk the full numbering range so gaps in the chain are represented.
        for number in range(min(names), max(names) + 1):
            if number not in names:
                seq_indices.append(gap_index)
                xyz_rows.extend(missing for _ in atoms)
                continue
            for icode in sorted(names[number]):
                seq_indices.append(aa_3_N.get(names[number][icode], gap_index))
                residue_atoms = coords[number][icode]
                xyz_rows.extend(residue_atoms.get(atom, missing) for atom in atoms)

        xyz = np.array(xyz_rows).reshape(-1, len(atoms), 3)
        seq = "".join(alpha_1[index] for index in seq_indices)
        return xyz, seq

    pdb_dict_list = []
    # os.path.join copes with a missing or present trailing slash.
    for biounit in sorted(glob.glob(os.path.join(args.input_path, '*.pdb'))):
        lines_by_chain = read_atom_lines(biounit)
        my_dict = {}
        chain_seqs = []
        for letter in chain_alphabet:
            if letter not in lines_by_chain:
                continue
            xyz, seq = parse_chain(lines_by_chain[letter], backbone_atoms)
            chain_seqs.append(seq)
            my_dict['seq_chain_' + letter] = seq
            my_dict['coords_chain_' + letter] = {
                atom + '_chain_' + letter: xyz[:, column, :].tolist()
                for column, atom in enumerate(backbone_atoms)
            }
        # Strip the directory and the 4-character ".pdb" extension.
        my_dict['name'] = os.path.basename(biounit)[:-4]
        my_dict['num_of_chains'] = len(chain_seqs)
        my_dict['seq'] = "".join(chain_seqs)
        pdb_dict_list.append(my_dict)

    with open(args.output_path, 'w') as f:
        for entry in pdb_dict_list:
            f.write(json.dumps(entry) + '\n')
           

if __name__ == "__main__":
    # Command-line entry point: parse the two path options and run main().
    argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Both options are mandatory. main() cannot work with a missing (None)
    # path, so let argparse reject the call with a usage message instead.
    argparser.add_argument("--input_path", type=str, required=True, help="Path to a folder with pdb files, e.g. /home/my_pdbs/")
    argparser.add_argument("--output_path", type=str, required=True, help="Path where to save .jsonl dictionary of parsed pdbs")

    args = argparser.parse_args()
    main(args)