Spaces:

simonduerr
/

diffdock

Sleeping

App Files Files Community

diffdock / datasets /pdbbind.py

simonduerr

Update datasets/pdbbind.py

1be0b1d 12 months ago

raw

history blame

26.8 kB

	import binascii
	import glob
	import hashlib
	import os
	import pickle
	from collections import defaultdict
	from multiprocessing import Pool
	import random
	import copy

	import numpy as np
	import torch
	from rdkit.Chem import MolToSmiles, MolFromSmiles, AddHs
	from torch_geometric.data import Dataset, HeteroData
	from torch_geometric.loader import DataLoader, DataListLoader
	from torch_geometric.transforms import BaseTransform
	from tqdm import tqdm

	from datasets.process_mols import (
	read_molecule,
	get_rec_graph,
	generate_conformer,
	get_lig_graph_with_matching,
	extract_receptor_structure,
	parse_receptor,
	parse_pdb_from_path,
	)
	from utils.diffusion_utils import modify_conformer, set_time
	from utils.utils import read_strings_from_txt
	from utils import so3, torus


	class NoiseTransform(BaseTransform):
	    """Perturb a complex with translation, rotation and torsion noise.

	    The transform samples (or accepts) one update per component, applies them
	    to ``data`` through ``modify_conformer`` and stores the corresponding
	    score targets on ``data`` (``tr_score``, ``rot_score``, ``tor_score``,
	    ``tor_sigma_edge``).

	    Args:
	        t_to_sigma: Callable mapping ``(t_tr, t_rot, t_tor)`` to the triple
	            ``(tr_sigma, rot_sigma, tor_sigma)``.
	        no_torsion: When true, no torsion update is applied and the torsion
	            targets are set to ``None``.
	        all_atom: Forwarded to ``set_time``.
	    """

	    def __init__(self, t_to_sigma, no_torsion, all_atom):
	        self.t_to_sigma = t_to_sigma
	        self.no_torsion = no_torsion
	        self.all_atom = all_atom

	    def __call__(self, data):
	        """Draw one uniform time and use it for all three noise components."""
	        shared_t = np.random.uniform()
	        return self.apply_noise(data, shared_t, shared_t, shared_t)

	    def apply_noise(
	        self,
	        data,
	        t_tr,
	        t_rot,
	        t_tor,
	        tr_update=None,
	        rot_update=None,
	        torsion_updates=None,
	    ):
	        """Apply noise at the given times and attach the score targets to ``data``.

	        Any of ``tr_update``, ``rot_update`` and ``torsion_updates`` that is
	        left as ``None`` is sampled here; otherwise the supplied value is used.

	        Returns:
	            The same ``data`` object, modified in place.
	        """
	        # When the ligand positions are not a single tensor, one entry is
	        # picked at random (presumably one of several stored conformers).
	        if not torch.is_tensor(data["ligand"].pos):
	            data["ligand"].pos = random.choice(data["ligand"].pos)

	        tr_sigma, rot_sigma, tor_sigma = self.t_to_sigma(t_tr, t_rot, t_tor)
	        set_time(data, t_tr, t_rot, t_tor, 1, self.all_atom, device=None)

	        if tr_update is None:
	            tr_update = torch.normal(mean=0, std=tr_sigma, size=(1, 3))
	        if rot_update is None:
	            rot_update = so3.sample_vec(eps=rot_sigma)
	        # The torsion noise is drawn even when torsions are disabled, which
	        # keeps the NumPy random stream identical in both modes.
	        if torsion_updates is None:
	            torsion_updates = np.random.normal(
	                loc=0.0, scale=tor_sigma, size=data["ligand"].edge_mask.sum()
	            )
	        if self.no_torsion:
	            torsion_updates = None

	        rot_update_tensor = torch.from_numpy(rot_update).float()
	        modify_conformer(data, tr_update, rot_update_tensor, torsion_updates)

	        data.tr_score = -tr_update / tr_sigma**2
	        rot_score = so3.score_vec(vec=rot_update, eps=rot_sigma)
	        data.rot_score = torch.from_numpy(rot_score).float().unsqueeze(0)
	        if self.no_torsion:
	            data.tor_score = None
	            data.tor_sigma_edge = None
	        else:
	            tor_score = torus.score(torsion_updates, tor_sigma)
	            data.tor_score = torch.from_numpy(tor_score).float()
	            data.tor_sigma_edge = (
	                np.ones(data["ligand"].edge_mask.sum()) * tor_sigma
	            )
	        return data


	class PDBBind(Dataset):
	    """Dataset of protein-ligand complex graphs backed by an on-disk pickle cache.

	    On construction the dataset looks for ``heterographs.pkl`` (and, when
	    ``require_ligand`` is set, ``rdkit_ligands.pkl``) in a cache directory
	    whose name encodes the preprocessing options. If the files are missing,
	    the complexes are preprocessed first: from a split file of complex names
	    (``preprocessing``) or, when both ``protein_path_list`` and
	    ``ligand_descriptions`` are given, from explicit protein files and ligand
	    descriptions (``inference_preprocessing``).

	    NOTE: the cache is read with ``pickle``; only point ``cache_path`` at
	    directories whose contents you trust.
	    """

	    # Number of complexes per checkpoint file when several workers are used.
	    _CHUNK_SIZE = 1000

	    def __init__(
	        self,
	        root,
	        transform=None,
	        cache_path="data/cache",
	        split_path="data/",
	        limit_complexes=0,
	        receptor_radius=30,
	        num_workers=1,
	        c_alpha_max_neighbors=None,
	        popsize=15,
	        maxiter=15,
	        matching=True,
	        keep_original=False,
	        max_lig_size=None,
	        remove_hs=False,
	        num_conformers=1,
	        all_atoms=False,
	        atom_radius=5,
	        atom_max_neighbors=None,
	        esm_embeddings_path=None,
	        require_ligand=False,
	        ligands_list=None,
	        protein_path_list=None,
	        ligand_descriptions=None,
	        keep_local_structures=False,
	    ):
	        """Load the cached graphs, preprocessing them first if necessary.

	        Args:
	            root: Directory containing one sub-folder per complex; also passed
	                to the parent ``Dataset``.
	            transform: Passed to the parent ``Dataset``.
	            cache_path: Base directory of the cache; suffixes and a
	                sub-directory derived from the options are appended.
	            split_path: Text file with the complex names to process.
	            limit_complexes: If not ``None`` and non-zero, only the first
	                ``limit_complexes`` names of the split are used.
	            receptor_radius, c_alpha_max_neighbors, all_atoms, atom_radius,
	            atom_max_neighbors: Forwarded to ``get_rec_graph``.
	            num_workers: With more than one worker, preprocessing runs in a
	                process pool and is checkpointed per chunk.
	            popsize, maxiter, matching, keep_original, num_conformers:
	                Forwarded to ``get_lig_graph_with_matching``.
	            max_lig_size: Ligands with more heavy atoms than this are skipped.
	            remove_hs: Forwarded to the ligand and receptor graph builders.
	            esm_embeddings_path: Optional location of language-model
	                embeddings (a file in split mode, a directory in inference
	                mode).
	            require_ligand: Also load the RDKit ligands and attach them to
	                the samples returned by ``get`` as ``mol``.
	            ligands_list: Accepted for interface compatibility; not used by
	                the code in this class.
	            protein_path_list: Protein file paths (inference mode).
	            ligand_descriptions: SMILES strings or ligand file paths, one per
	                protein path (inference mode).
	            keep_local_structures: In inference mode, keep the conformer of
	                ligands read from file instead of regenerating it.
	        """
	        super(PDBBind, self).__init__(root, transform)
	        self.pdbbind_dir = root
	        self.max_lig_size = max_lig_size
	        self.split_path = split_path
	        self.limit_complexes = limit_complexes
	        self.receptor_radius = receptor_radius
	        self.num_workers = num_workers
	        self.c_alpha_max_neighbors = c_alpha_max_neighbors
	        self.remove_hs = remove_hs
	        self.esm_embeddings_path = esm_embeddings_path
	        self.require_ligand = require_ligand
	        self.protein_path_list = protein_path_list
	        self.ligand_descriptions = ligand_descriptions
	        self.keep_local_structures = keep_local_structures
	        self.popsize, self.maxiter = popsize, maxiter
	        self.matching, self.keep_original = matching, keep_original
	        self.num_conformers = num_conformers
	        self.all_atoms = all_atoms
	        self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors

	        inference_mode = (
	            protein_path_list is not None and ligand_descriptions is not None
	        )

	        # The cache location encodes every option that changes the
	        # preprocessed output, so different settings never share a cache.
	        if matching or inference_mode:
	            cache_path += "_torsion"
	        if all_atoms:
	            cache_path += "_allatoms"
	        cache_name = (
	            f"limit{self.limit_complexes}"
	            f"_INDEX{os.path.splitext(os.path.basename(self.split_path))[0]}"
	            f"_maxLigSize{self.max_lig_size}_H{int(not self.remove_hs)}"
	            f"_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}"
	        )
	        if all_atoms:
	            cache_name += f"_atomRad{atom_radius}_atomMax{atom_max_neighbors}"
	        if matching and num_conformers != 1:
	            cache_name += f"_confs{num_conformers}"
	        if self.esm_embeddings_path is not None:
	            cache_name += "_esmEmbeddings"
	        if keep_local_structures:
	            cache_name += "_keptLocalStruct"
	        if inference_mode:
	            # Checksum of the inputs distinguishes different inference runs.
	            cache_name += str(
	                binascii.crc32(
	                    "".join(ligand_descriptions + protein_path_list).encode()
	                )
	            )
	        self.full_cache_path = os.path.join(cache_path, cache_name)

	        graphs_file = os.path.join(self.full_cache_path, "heterographs.pkl")
	        ligands_file = os.path.join(self.full_cache_path, "rdkit_ligands.pkl")
	        cache_missing = not os.path.exists(graphs_file) or (
	            require_ligand and not os.path.exists(ligands_file)
	        )
	        if cache_missing:
	            os.makedirs(self.full_cache_path, exist_ok=True)
	            if inference_mode:
	                self.inference_preprocessing()
	            else:
	                self.preprocessing()

	        print("loading data from memory: ", graphs_file)
	        self.complex_graphs = self._load_pickle(graphs_file)
	        if require_ligand:
	            self.rdkit_ligands = self._load_pickle(ligands_file)

	        print_statistics(self.complex_graphs)

	    def len(self):
	        """Return the number of cached complex graphs."""
	        return len(self.complex_graphs)

	    def get(self, idx):
	        """Return a deep copy of graph ``idx`` (with ``mol`` set if ligands are loaded)."""
	        complex_graph = copy.deepcopy(self.complex_graphs[idx])
	        if self.require_ligand:
	            complex_graph.mol = copy.deepcopy(self.rdkit_ligands[idx])
	        return complex_graph

	    def preprocessing(self):
	        """Build and cache the graphs for the complexes named in ``split_path``."""
	        print(
	            f"Processing complexes from [{self.split_path}] and saving it to [{self.full_cache_path}]"
	        )

	        complex_names_all = read_strings_from_txt(self.split_path)
	        if self.limit_complexes is not None and self.limit_complexes != 0:
	            complex_names_all = complex_names_all[: self.limit_complexes]
	        print(f"Loading {len(complex_names_all)} complexes.")

	        if self.esm_embeddings_path is not None:
	            id_to_embeddings = torch.load(self.esm_embeddings_path)
	            # Group the embeddings by the part of the key before the first
	            # underscore, keeping only complexes that are part of this split.
	            wanted_names = set(complex_names_all)
	            chain_embeddings_dictlist = defaultdict(list)
	            for key, embedding in id_to_embeddings.items():
	                key_name = key.split("_")[0]
	                if key_name in wanted_names:
	                    chain_embeddings_dictlist[key_name].append(embedding)
	            lm_embeddings_chains_all = [
	                chain_embeddings_dictlist[name] for name in complex_names_all
	            ]
	        else:
	            lm_embeddings_chains_all = [None] * len(complex_names_all)

	        # In this mode the ligands are read from the complex folders, so no
	        # ligand object or description is handed to get_complex.
	        placeholders = [None] * len(complex_names_all)
	        self._process_and_cache(
	            complex_names_all, lm_embeddings_chains_all, placeholders, placeholders
	        )

	    def inference_preprocessing(self):
	        """Build and cache graphs for explicit protein paths and ligand descriptions."""
	        ligands_list = []
	        print("Reading molecules and generating local structures with RDKit")
	        for ligand_description in tqdm(self.ligand_descriptions):
	            # Try the description as SMILES first; fall back to a file path.
	            mol = MolFromSmiles(ligand_description)
	            print(ligand_description, mol)
	            if mol is not None:
	                mol = AddHs(mol)
	                generate_conformer(mol)
	            else:
	                mol = read_molecule(
	                    ligand_description, remove_hs=False, sanitize=True
	                )
	                print(mol)
	                if not self.keep_local_structures:
	                    mol.RemoveAllConformers()
	                    mol = AddHs(mol)
	                    generate_conformer(mol)
	            ligands_list.append(mol)

	        if self.esm_embeddings_path is not None:
	            print("Reading language model embeddings.")
	            if not os.path.exists(self.esm_embeddings_path):
	                raise Exception(
	                    "ESM embeddings path does not exist: ", self.esm_embeddings_path
	                )
	            lm_embeddings_chains_all = []
	            for protein_path in self.protein_path_list:
	                # One embedding file per chain; the file names start with the
	                # protein file name, and sorting fixes the chain order.
	                pattern = (
	                    os.path.join(
	                        self.esm_embeddings_path, os.path.basename(protein_path)
	                    )
	                    + "*"
	                )
	                lm_embeddings_chains_all.append(
	                    [
	                        torch.load(embeddings_path)["representations"][33]
	                        for embeddings_path in sorted(glob.glob(pattern))
	                    ]
	                )
	        else:
	            lm_embeddings_chains_all = [None] * len(self.protein_path_list)

	        print("Generating graphs for ligands and proteins")
	        self._process_and_cache(
	            self.protein_path_list,
	            lm_embeddings_chains_all,
	            ligands_list,
	            self.ligand_descriptions,
	        )

	    def _process_and_cache(
	        self, names, lm_embeddings_chains_all, ligands, ligand_descriptions
	    ):
	        """Run ``get_complex`` on every input and pickle the results into the cache.

	        The four arguments are parallel sequences; element ``k`` of each forms
	        the tuple handed to ``get_complex``. With more than one worker the
	        inputs are processed in chunks of ``_CHUNK_SIZE`` in a process pool,
	        each chunk is checkpointed to ``heterographs{i}.pkl`` /
	        ``rdkit_ligands{i}.pkl``, and chunks with an existing checkpoint are
	        skipped, so an interrupted run can resume.
	        """
	        cache_dir = self.full_cache_path
	        final_graphs_file = os.path.join(cache_dir, "heterographs.pkl")
	        final_ligands_file = os.path.join(cache_dir, "rdkit_ligands.pkl")

	        if self.num_workers > 1:
	            chunk_size = self._CHUNK_SIZE
	            num_chunks = len(names) // chunk_size + 1
	            for i in range(num_chunks):
	                chunk_graphs_file = os.path.join(cache_dir, f"heterographs{i}.pkl")
	                chunk_ligands_file = os.path.join(cache_dir, f"rdkit_ligands{i}.pkl")
	                if os.path.exists(chunk_graphs_file):
	                    continue
	                window = slice(chunk_size * i, chunk_size * (i + 1))
	                names_chunk = names[window]
	                jobs = zip(
	                    names_chunk,
	                    lm_embeddings_chains_all[window],
	                    ligands[window],
	                    ligand_descriptions[window],
	                )
	                complex_graphs, rdkit_ligands = [], []
	                # The context manager terminates the pool even when a worker
	                # raises, so no processes are leaked.
	                with Pool(self.num_workers, maxtasksperchild=1) as pool:
	                    with tqdm(
	                        total=len(names_chunk),
	                        desc=f"loading complexes {i}/{num_chunks}",
	                    ) as pbar:
	                        for graphs, ligs in pool.imap_unordered(
	                            self.get_complex, jobs
	                        ):
	                            complex_graphs.extend(graphs)
	                            rdkit_ligands.extend(ligs)
	                            pbar.update()

	                # The graph file is the "chunk done" marker checked above, so
	                # it is written last: its presence implies the ligands exist.
	                self._dump_pickle(rdkit_ligands, chunk_ligands_file)
	                self._dump_pickle(complex_graphs, chunk_graphs_file)

	            # Merge one kind at a time to limit peak memory.
	            rdkit_ligands_all = []
	            for i in range(num_chunks):
	                rdkit_ligands_all.extend(
	                    self._load_pickle(
	                        os.path.join(cache_dir, f"rdkit_ligands{i}.pkl")
	                    )
	                )
	            self._dump_pickle(rdkit_ligands_all, final_ligands_file)
	            del rdkit_ligands_all

	            complex_graphs_all = []
	            for i in range(num_chunks):
	                complex_graphs_all.extend(
	                    self._load_pickle(
	                        os.path.join(cache_dir, f"heterographs{i}.pkl")
	                    )
	                )
	            self._dump_pickle(complex_graphs_all, final_graphs_file)
	        else:
	            complex_graphs, rdkit_ligands = [], []
	            jobs = zip(names, lm_embeddings_chains_all, ligands, ligand_descriptions)
	            with tqdm(total=len(names), desc="loading complexes") as pbar:
	                for graphs, ligs in map(self.get_complex, jobs):
	                    complex_graphs.extend(graphs)
	                    rdkit_ligands.extend(ligs)
	                    pbar.update()
	            self._dump_pickle(rdkit_ligands, final_ligands_file)
	            self._dump_pickle(complex_graphs, final_graphs_file)

	    @staticmethod
	    def _dump_pickle(obj, path):
	        """Pickle ``obj`` to ``path``."""
	        with open(path, "wb") as f:
	            pickle.dump(obj, f)

	    @staticmethod
	    def _load_pickle(path):
	        """Unpickle and return the object stored at ``path``."""
	        with open(path, "rb") as f:
	            return pickle.load(f)

	    def get_complex(self, par):
	        """Build the graphs for one complex.

	        Args:
	            par: Tuple ``(name, lm_embedding_chains, ligand, ligand_description)``.
	                When ``ligand`` is ``None``, ``name`` is a folder inside
	                ``pdbbind_dir`` from which receptor and ligands are read;
	                otherwise ``name`` is the path of the protein file and
	                ``ligand`` is the molecule to use.

	        Returns:
	            ``(complex_graphs, ligands)``: two lists of equal length where
	            ``ligands[k]`` is the molecule ``complex_graphs[k]`` was built
	            from. Both are empty when the complex cannot be read.

	        Raises:
	            Exception: Any error raised while building the ligand or receptor
	                graph is printed and re-raised.
	        """
	        name, lm_embedding_chains, ligand, ligand_description = par
	        if ligand is None and not os.path.exists(
	            os.path.join(self.pdbbind_dir, name)
	        ):
	            print("Folder not found", name)
	            return [], []

	        if ligand is not None:
	            rec_model = parse_pdb_from_path(name)
	            name = f"{name}____{ligand_description}"
	            ligs = [ligand]
	        else:
	            try:
	                rec_model = parse_receptor(name, self.pdbbind_dir)
	            except Exception as e:
	                print(f"Skipping {name} because of the error:")
	                print(e)
	                return [], []

	            ligs = read_mols(self.pdbbind_dir, name, remove_hs=False)

	        # Only ligands that yield a graph are returned, so the two result
	        # lists stay index-aligned (get() pairs them by position).
	        complex_graphs, kept_ligs = [], []
	        for lig in ligs:
	            if (
	                self.max_lig_size is not None
	                and lig.GetNumHeavyAtoms() > self.max_lig_size
	            ):
	                print(
	                    f"Ligand with {lig.GetNumHeavyAtoms()} heavy atoms is larger than max_lig_size {self.max_lig_size}. Not including {name} in preprocessed data."
	                )
	                continue
	            complex_graph = HeteroData()
	            complex_graph["name"] = name
	            try:
	                get_lig_graph_with_matching(
	                    lig,
	                    complex_graph,
	                    self.popsize,
	                    self.maxiter,
	                    self.matching,
	                    self.keep_original,
	                    self.num_conformers,
	                    remove_hs=self.remove_hs,
	                )
	                (
	                    rec,
	                    rec_coords,
	                    c_alpha_coords,
	                    n_coords,
	                    c_coords,
	                    lm_embeddings,
	                ) = extract_receptor_structure(
	                    copy.deepcopy(rec_model),
	                    lig,
	                    lm_embedding_chains=lm_embedding_chains,
	                )
	                if lm_embeddings is not None and len(c_alpha_coords) != len(
	                    lm_embeddings
	                ):
	                    print(
	                        f"LM embeddings for complex {name} did not have the right length for the protein. Skipping {name}."
	                    )
	                    continue

	                get_rec_graph(
	                    rec,
	                    rec_coords,
	                    c_alpha_coords,
	                    n_coords,
	                    c_coords,
	                    complex_graph,
	                    rec_radius=self.receptor_radius,
	                    c_alpha_max_neighbors=self.c_alpha_max_neighbors,
	                    all_atoms=self.all_atoms,
	                    atom_radius=self.atom_radius,
	                    atom_max_neighbors=self.atom_max_neighbors,
	                    remove_hs=self.remove_hs,
	                    lm_embeddings=lm_embeddings,
	                )
	            except Exception as e:
	                print(f"Skipping {name} because of the error:")
	                print(e)
	                # NOTE: despite the message above, the error is propagated
	                # rather than skipped, so the failure is visible to the caller.
	                raise

	            # Express all coordinates relative to the receptor centroid.
	            protein_center = torch.mean(
	                complex_graph["receptor"].pos, dim=0, keepdim=True
	            )
	            complex_graph["receptor"].pos -= protein_center
	            if self.all_atoms:
	                complex_graph["atom"].pos -= protein_center

	            if (not self.matching) or self.num_conformers == 1:
	                complex_graph["ligand"].pos -= protein_center
	            else:
	                # Otherwise the ligand positions are iterated and each entry
	                # is shifted in place.
	                for p in complex_graph["ligand"].pos:
	                    p -= protein_center

	            complex_graph.original_center = protein_center
	            complex_graphs.append(complex_graph)
	            kept_ligs.append(lig)
	        return complex_graphs, kept_ligs


	def print_statistics(complex_graphs):
	    """Print the number of complexes and mean/std/max of four per-complex quantities.

	    For each graph the following are collected: the largest norm of a
	    receptor position, the largest distance of a ligand position from the
	    ligand's mean position, the norm of the ligand's mean position, and
	    ``rmsd_matching`` (``0`` when the graph has no such entry).

	    Args:
	        complex_graphs: Sequence of graphs exposing ``["ligand"].pos`` and
	            ``["receptor"].pos``. If the ligand positions are not a tensor,
	            their first element is used.

	    Returns:
	        None. For an empty sequence only the count line is printed, because
	        the maximum of an empty array is undefined and would raise.
	    """
	    statistics = ([], [], [], [])

	    for complex_graph in complex_graphs:
	        lig_pos = (
	            complex_graph["ligand"].pos
	            if torch.is_tensor(complex_graph["ligand"].pos)
	            else complex_graph["ligand"].pos[0]
	        )
	        radius_protein = torch.max(
	            torch.linalg.vector_norm(complex_graph["receptor"].pos, dim=1)
	        )
	        molecule_center = torch.mean(lig_pos, dim=0)
	        radius_molecule = torch.max(
	            torch.linalg.vector_norm(lig_pos - molecule_center.unsqueeze(0), dim=1)
	        )
	        distance_center = torch.linalg.vector_norm(molecule_center)
	        statistics[0].append(radius_protein)
	        statistics[1].append(radius_molecule)
	        statistics[2].append(distance_center)
	        if "rmsd_matching" in complex_graph:
	            statistics[3].append(complex_graph.rmsd_matching)
	        else:
	            statistics[3].append(0)

	    print("Number of complexes: ", len(complex_graphs))
	    if len(complex_graphs) == 0:
	        # Nothing to summarise; np.max on an empty array raises ValueError.
	        return

	    labels = (
	        "radius protein",
	        "radius molecule",
	        "distance protein-mol",
	        "rmsd matching",
	    )
	    for label, values in zip(labels, statistics):
	        array = np.asarray(values)
	        print(
	            f"{label}: mean {np.mean(array)}, std {np.std(array)}, max {np.max(array)}"
	        )


	def construct_loader(args, t_to_sigma):
	    """Create the training and validation data loaders.

	    Args:
	        args: Namespace-like object providing the dataset and loader options
	            read below (``data_dir``, ``split_train``, ``split_val``,
	            ``batch_size``, ...).
	        t_to_sigma: Callable handed to ``NoiseTransform``.

	    Returns:
	        Tuple ``(train_loader, val_loader)``.
	    """
	    noise_transform = NoiseTransform(
	        t_to_sigma=t_to_sigma, no_torsion=args.no_torsion, all_atom=args.all_atoms
	    )

	    # Options shared by the training and the validation dataset.
	    dataset_kwargs = dict(
	        transform=noise_transform,
	        root=args.data_dir,
	        limit_complexes=args.limit_complexes,
	        receptor_radius=args.receptor_radius,
	        c_alpha_max_neighbors=args.c_alpha_max_neighbors,
	        remove_hs=args.remove_hs,
	        max_lig_size=args.max_lig_size,
	        matching=not args.no_torsion,
	        popsize=args.matching_popsize,
	        maxiter=args.matching_maxiter,
	        num_workers=args.num_workers,
	        all_atoms=args.all_atoms,
	        atom_radius=args.atom_radius,
	        atom_max_neighbors=args.atom_max_neighbors,
	        esm_embeddings_path=args.esm_embeddings_path,
	    )

	    # Only the training set receives the configured number of conformers;
	    # the validation set keeps the dataset default.
	    train_dataset = PDBBind(
	        cache_path=args.cache_path,
	        split_path=args.split_train,
	        keep_original=True,
	        num_conformers=args.num_conformers,
	        **dataset_kwargs,
	    )
	    val_dataset = PDBBind(
	        cache_path=args.cache_path,
	        split_path=args.split_val,
	        keep_original=True,
	        **dataset_kwargs,
	    )

	    # The list loader is chosen whenever CUDA is available.
	    if torch.cuda.is_available():
	        loader_class = DataListLoader
	    else:
	        loader_class = DataLoader

	    loaders = []
	    for dataset in (train_dataset, val_dataset):
	        loaders.append(
	            loader_class(
	                dataset=dataset,
	                batch_size=args.batch_size,
	                num_workers=args.num_dataloader_workers,
	                shuffle=True,
	                pin_memory=args.pin_memory,
	            )
	        )
	    train_loader, val_loader = loaders

	    return train_loader, val_loader


	def read_mol(pdbbind_dir, name, remove_hs=False):
	    """Read the ligand of complex ``name``, preferring the .sdf over the .mol2 file.

	    Args:
	        pdbbind_dir: Directory containing one folder per complex.
	        name: Complex name; the files read are ``<name>/<name>_ligand.sdf``
	            and, if that yields ``None``, ``<name>/<name>_ligand.mol2``.
	        remove_hs: Forwarded to ``read_molecule``.

	    Returns:
	        Whatever ``read_molecule`` returned for the last file tried (``None``
	        if neither file could be read).
	    """
	    ligand_stem = os.path.join(pdbbind_dir, name, f"{name}_ligand")
	    lig = None
	    # The .mol2 file is only consulted when the .sdf file gives no molecule.
	    for extension in (".sdf", ".mol2"):
	        lig = read_molecule(
	            ligand_stem + extension,
	            remove_hs=remove_hs,
	            sanitize=True,
	        )
	        if lig is not None:
	            break
	    return lig


	def read_mols(pdbbind_dir, name, remove_hs=False):
	    """Read every ligand stored as an .sdf file in the folder of complex ``name``.

	    Files whose name contains ``"rdkit"`` are ignored. If an .sdf file
	    yields no molecule and a .mol2 file with the same stem exists, that file
	    is tried instead. Files are visited in sorted order so the result does
	    not depend on the arbitrary order in which the OS lists the directory.

	    Args:
	        pdbbind_dir: Directory containing one folder per complex.
	        name: Name of the complex folder.
	        remove_hs: Forwarded to ``read_molecule``.

	    Returns:
	        List of the molecules that could be read (possibly empty).
	    """
	    complex_dir = os.path.join(pdbbind_dir, name)
	    ligs = []
	    for file in sorted(os.listdir(complex_dir)):
	        if not file.endswith(".sdf") or "rdkit" in file:
	            continue
	        lig = read_molecule(
	            os.path.join(complex_dir, file),
	            remove_hs=remove_hs,
	            sanitize=True,
	        )
	        mol2_path = os.path.join(complex_dir, file[:-4] + ".mol2")
	        # Fall back to the .mol2 file if the .sdf file gave no molecule.
	        if lig is None and os.path.exists(mol2_path):
	            print(
	                "Using the .sdf file failed. We found a .mol2 file instead and are trying to use that."
	            )
	            lig = read_molecule(
	                mol2_path,
	                remove_hs=remove_hs,
	                sanitize=True,
	            )
	        if lig is not None:
	            ligs.append(lig)
	    return ligs