"""Write the per-chain amino-acid sequences of PDBBind receptors to a FASTA file.

Every entry of ``--data_dir`` is treated as one complex directory that holds
the receptor PDB file(s) of that complex. For each chain of the receptor's
first model a one-letter sequence is built and stored as a FASTA record whose
id is ``<complex name>_chain_<chain index>``.
"""
import os
from argparse import FileType, ArgumentParser

import numpy as np
from Bio import SeqIO
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm

parser = ArgumentParser()
parser.add_argument('--data_dir', type=str, default='data/PDBBind_processed', help='')
parser.add_argument('--chain_cutoff', type=int, default=10, help='')
parser.add_argument('--out_file', type=str, default="data/pdbbind_sequences.fasta")
args = parser.parse_args()

cutoff = args.chain_cutoff
data_dir = args.data_dir
names = os.listdir(data_dir)
#%%
biopython_parser = PDBParser()

# Three-letter residue name -> one-letter code used in the FASTA output.
three_to_one = {'ALA': 'A',
                'ARG': 'R',
                'ASN': 'N',
                'ASP': 'D',
                'CYS': 'C',
                'GLN': 'Q',
                'GLU': 'E',
                'GLY': 'G',
                'HIS': 'H',
                'ILE': 'I',
                'LEU': 'L',
                'LYS': 'K',
                'MET': 'M',
                'MSE': 'M',  # almost the same AA as MET; the sulfur is just replaced by selenium
                'PHE': 'F',
                'PRO': 'P',
                'PYL': 'O',
                'SER': 'S',
                'SEC': 'U',
                'THR': 'T',
                'TRP': 'W',
                'TYR': 'Y',
                'VAL': 'V',
                'ASX': 'B',
                'GLX': 'Z',
                'XAA': 'X',
                'XLE': 'J'}

# A residue only contributes to the sequence when all of these atoms are present.
BACKBONE_ATOM_NAMES = frozenset({'N', 'CA', 'C'})

sequences = []
ids = []
for name in tqdm(names):
    if name == '.DS_Store':
        continue

    # Prefer the processed receptor file and fall back to the raw one.
    complex_dir = os.path.join(data_dir, name)
    rec_path = os.path.join(complex_dir, f'{name}_protein_processed.pdb')
    if not os.path.exists(rec_path):
        rec_path = os.path.join(complex_dir, f'{name}_protein.pdb')
    # With a chain cutoff above 10 the obabel-reduced file takes precedence
    # over the choice made above; if it is missing the raw file is used.
    if cutoff > 10:
        rec_path = os.path.join(complex_dir, f'{name}_protein_obabel_reduce.pdb')
        if not os.path.exists(rec_path):
            rec_path = os.path.join(complex_dir, f'{name}_protein.pdb')

    structure = biopython_parser.get_structure('random_id', rec_path)
    structure = structure[0]  # only the first model is used

    for i, chain in enumerate(structure):
        letters = []
        for residue in chain:
            resname = residue.get_resname()
            if resname == 'HOH':
                continue
            # Only residues that carry the N, CA and C atoms are added to the
            # sequence; anything else in the chain is skipped.
            atom_names = {atom.name for atom in residue}
            if not BACKBONE_ATOM_NAMES <= atom_names:
                continue
            letter = three_to_one.get(resname)
            if letter is None:
                # Unknown residue name: keep the position, marked with a dash.
                letter = '-'
                print("encountered unknown AA: ", resname, ' in the complex ', name, '. Replacing it with a dash - .')
            letters.append(letter)
        # A record is stored for every chain, even if its sequence is empty,
        # so the chain index in the id always matches the chain's position.
        sequences.append(''.join(letters))
        ids.append(f'{name}_chain_{i}')

records = [
    SeqRecord(Seq(seq), id=str(seq_id), description='')
    for seq_id, seq in zip(ids, sequences)
]
SeqIO.write(records, args.out_file, "fasta")