Spaces:

simonduerr
/

diffdock

Sleeping

App Files Files Community

diffdock / datasets /pdbbind_lm_embedding_preparation.py

gcorso

first commit

4a3f787 about 2 years ago

raw

history blame

2.92 kB

	import os
	from argparse import FileType, ArgumentParser

	import numpy as np
	from Bio.PDB import PDBParser
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from tqdm import tqdm

	# Command-line interface: where the per-complex folders live, which receptor
	# file variant to read, and where the resulting FASTA file is written.
	parser = ArgumentParser()
	parser.add_argument('--data_dir', type=str, default='data/PDBBind_processed',
	                    help='Directory containing one sub-directory per complex, each holding the receptor PDB file(s) of that complex')
	parser.add_argument('--chain_cutoff', type=int, default=10,
	                    help='Values above 10 make the script read <name>_protein_obabel_reduce.pdb '
	                         '(falling back to <name>_protein.pdb) instead of the default receptor file')
	parser.add_argument('--out_file', type=str, default="data/pdbbind_sequences.fasta",
	                    help='Path of the FASTA file to write')
	args = parser.parse_args()

	cutoff = args.chain_cutoff
	data_dir = args.data_dir
	# os.listdir returns entries in arbitrary order; sorting makes the order of
	# the records in the output file reproducible across runs and machines.
	names = sorted(os.listdir(data_dir))
	#%%
	from Bio import SeqIO
	# One shared PDB parser instance; it is used to load every receptor file
	# processed by this script.
	biopython_parser = PDBParser()

	# Lookup table from three-letter residue names to the one-letter codes that
	# make up the extracted sequences.
	three_to_one = {
	    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
	    'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
	    'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
	    'MET': 'M',
	    'MSE': 'M',  # nearly the same residue as MET: the sulfur is replaced by selenium
	    'PHE': 'F', 'PRO': 'P', 'PYL': 'O', 'SER': 'S',
	    'SEC': 'U', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y',
	    'VAL': 'V',
	    # Ambiguity / placeholder codes.
	    'ASX': 'B', 'GLX': 'Z', 'XAA': 'X', 'XLE': 'J',
	}

	def _chain_to_sequence(chain, complex_name):
	    """Return the one-letter sequence of a single Biopython chain.

	    Residues named 'HOH' are skipped. A residue contributes a letter only if
	    it contains atoms named 'N', 'CA' and 'C', i.e. only residues with a
	    complete backbone are treated as amino acids. Residue names that are not
	    keys of ``three_to_one`` are written as '-' and reported on stdout.

	    Args:
	        chain: iterable of residues exposing ``get_resname()`` and iteration
	            over atoms that have a ``name`` attribute.
	        complex_name: name of the complex, used only in the warning message.

	    Returns:
	        The sequence as a string (empty if no residue qualifies).
	    """
	    letters = []
	    for residue in chain:
	        resname = residue.get_resname()
	        if resname == 'HOH':
	            continue
	        # Only the presence of the backbone atoms matters, not their coordinates.
	        atom_names = {atom.name for atom in residue}
	        if not {'N', 'CA', 'C'} <= atom_names:
	            continue
	        letter = three_to_one.get(resname)
	        if letter is None:
	            letter = '-'
	            print("encountered unknown AA: ", resname, ' in the complex ', complex_name, '. Replacing it with a dash - .')
	        letters.append(letter)
	    return ''.join(letters)


	# Collect one sequence per chain of every complex found in data_dir.
	sequences = []
	ids = []
	for name in tqdm(names):
	    if name == '.DS_Store':
	        continue
	    # Prefer the processed receptor file; otherwise use the plain one.
	    rec_path = os.path.join(data_dir, name, f'{name}_protein_processed.pdb')
	    if not os.path.exists(rec_path):
	        rec_path = os.path.join(data_dir, name, f'{name}_protein.pdb')
	    # A chain cutoff above 10 overrides the choice above with the
	    # obabel-reduced file, again falling back to the plain receptor file.
	    if cutoff > 10:
	        rec_path = os.path.join(data_dir, name, f'{name}_protein_obabel_reduce.pdb')
	        if not os.path.exists(rec_path):
	            rec_path = os.path.join(data_dir, name, f'{name}_protein.pdb')
	    structure = biopython_parser.get_structure('random_id', rec_path)
	    structure = structure[0]  # only the first model is used
	    for i, chain in enumerate(structure):
	        sequences.append(_chain_to_sequence(chain, name))
	        ids.append(f'{name}_chain_{i}')

	# An empty description keeps each FASTA header down to just the record id.
	records = [
	    SeqRecord(Seq(seq), id=seq_id, description='')
	    for seq_id, seq in zip(ids, sequences)
	]
	SeqIO.write(records, args.out_file, "fasta")