Spaces:

simonduerr
/

diffdock

Running on T4

File size: 2,920 Bytes

4a3f787

import os
from argparse import FileType, ArgumentParser

import numpy as np
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm

parser = ArgumentParser()
parser.add_argument('--data_dir', type=str, default='data/PDBBind_processed', help='')
parser.add_argument('--chain_cutoff', type=int, default=10, help='')
parser.add_argument('--out_file', type=str, default="data/pdbbind_sequences.fasta")
args = parser.parse_args()

cutoff = args.chain_cutoff
data_dir = args.data_dir
names = os.listdir(data_dir)
#%%
from Bio import SeqIO
biopython_parser = PDBParser()

three_to_one = {'ALA':	'A',
'ARG':	'R',
'ASN':	'N',
'ASP':	'D',
'CYS':	'C',
'GLN':	'Q',
'GLU':	'E',
'GLY':	'G',
'HIS':	'H',
'ILE':	'I',
'LEU':	'L',
'LYS':	'K',
'MET':	'M',
'MSE':  'M', # this is almost the same AA as MET. The sulfur is just replaced by Selen
'PHE':	'F',
'PRO':	'P',
'PYL':	'O',
'SER':	'S',
'SEC':	'U',
'THR':	'T',
'TRP':	'W',
'TYR':	'Y',
'VAL':	'V',
'ASX':	'B',
'GLX':	'Z',
'XAA':	'X',
'XLE':	'J'}

sequences = []
ids = []
for name in tqdm(names):
    if name == '.DS_Store': continue
    if os.path.exists(os.path.join(data_dir, name, f'{name}_protein_processed.pdb')):
        rec_path = os.path.join(data_dir, name, f'{name}_protein_processed.pdb')
    else:
        rec_path = os.path.join(data_dir, name, f'{name}_protein.pdb')
    if cutoff > 10:
        rec_path = os.path.join(data_dir, name, f'{name}_protein_obabel_reduce.pdb')
        if not os.path.exists(rec_path):
            rec_path = os.path.join(data_dir, name, f'{name}_protein.pdb')
    structure = biopython_parser.get_structure('random_id', rec_path)
    structure = structure[0]
    for i, chain in enumerate(structure):
        seq = ''
        for res_idx, residue in enumerate(chain):
            if residue.get_resname() == 'HOH':
                continue
            residue_coords = []
            c_alpha, n, c = None, None, None
            for atom in residue:
                if atom.name == 'CA':
                    c_alpha = list(atom.get_vector())
                if atom.name == 'N':
                    n = list(atom.get_vector())
                if atom.name == 'C':
                    c = list(atom.get_vector())
            if c_alpha != None and n != None and c != None:  # only append residue if it is an amino acid and not
                try:
                    seq += three_to_one[residue.get_resname()]
                except Exception as e:
                    seq += '-'
                    print("encountered unknown AA: ", residue.get_resname(), ' in the complex ', name, '. Replacing it with a dash - .')
        sequences.append(seq)
        ids.append(f'{name}_chain_{i}')
records = []
for (index, seq) in zip(ids,sequences):
    record = SeqRecord(Seq(seq), str(index))
    record.description = ''
    records.append(record)
SeqIO.write(records, args.out_file, "fasta")