"""Gradio demo for DiffLinker.

The script loads the size-prediction network and the diffusion checkpoints
listed in ``MODELS_METADATA`` at import time, then builds and launches a
Gradio ``Blocks`` interface in which a user uploads fragments (and optionally
a target protein), triggers linker generation and inspects the results.
"""
import argparse
import shutil
import gradio as gr
import numpy as np
import os
import torch
import output

from rdkit import Chem
from src import const
from src.datasets import (
    get_dataloader, collate_with_fragment_edges, collate_with_fragment_without_pocket_edges,
    parse_molecule, MOADDataset
)
from src.lightning import DDPM
from src.linker_size_lightning import SizeClassifier
from src.generation import generate_linkers, try_to_convert_to_sdf, get_pocket
from zipfile import ZipFile


MIN_N_STEPS = 100
MAX_N_STEPS = 500
MAX_BATCH_SIZE = 20

# File extensions accepted for the fragments and for the target protein.
FRAGMENT_EXTENSIONS = ['sdf', 'pdb', 'mol', 'mol2']
PROTEIN_EXTENSIONS = ['pdb']

# Upper limit on the number of atoms in the uploaded fragments molecule.
MAX_FRAGMENT_ATOMS = 100

MODELS_METADATA = {
    'geom_difflinker': {
        'link': 'https://zenodo.org/record/7121300/files/geom_difflinker.ckpt?download=1',
        'path': 'models/geom_difflinker.ckpt',
    },
    'geom_difflinker_given_anchors': {
        'link': 'https://zenodo.org/record/7775568/files/geom_difflinker_given_anchors.ckpt?download=1',
        'path': 'models/geom_difflinker_given_anchors.ckpt',
    },
    'pockets_difflinker': {
        # 'link': 'https://zenodo.org/record/7775568/files/pockets_difflinker_full_no_anchors.ckpt?download=1',
        # 'path': 'models/pockets_difflinker.ckpt',
        'link': 'https://zenodo.org/records/10988017/files/pockets_difflinker_full_no_anchors_fc_pdb_excluded.ckpt?download=1',
        'path': 'models/pockets_difflinker_full_no_anchors_fc_pdb_excluded.ckpt',
    },
    'pockets_difflinker_given_anchors': {
        # 'link': 'https://zenodo.org/record/7775568/files/pockets_difflinker_full.ckpt?download=1',
        # 'path': 'models/pockets_difflinker_given_anchors.ckpt',
        'link': 'https://zenodo.org/records/10988017/files/pockets_difflinker_full_fc_pdb_excluded.ckpt?download=1',
        'path': 'models/pockets_difflinker_full_fc_pdb_excluded.ckpt',
    },
}

parser = argparse.ArgumentParser()
parser.add_argument('--ip', type=str, default=None)
args = parser.parse_args()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device: {device}')
os.makedirs("results", exist_ok=True)

size_gnn_path = 'models/geom_size_gnn.ckpt'
size_nn = SizeClassifier.load_from_checkpoint(size_gnn_path, map_location=device).eval().to(device)
print('Loaded SizeGNN model')

diffusion_models = {}
for model_name, metadata in MODELS_METADATA.items():
    diffusion_path = metadata['path']
    diffusion_models[model_name] = DDPM.load_from_checkpoint(diffusion_path, map_location=device).eval().to(device)
    print(f'Loaded model {model_name}')

print(os.curdir)
print(os.path.abspath(os.curdir))
print(os.listdir(os.curdir))


def _file_path(file_or_path):
    """Return the filesystem path of an upload given as a string or as an object with ``.name``."""
    return file_or_path if isinstance(file_or_path, str) else file_or_path.name


def _clean_message(exc):
    """Return the exception text with single quotes stripped, as done before embedding it in HTML."""
    return str(exc).replace("'", '')


def _html_response(html):
    """Wrap ``html`` in the iframe template and pad the 4-element ``generate`` response with ``None``."""
    return [output.IFRAME_TEMPLATE.format(html=html), None, None, None]


def _error_response(message):
    """Build the 4-element ``generate`` response showing ``message`` as an error."""
    return _html_response(output.ERROR_FORMAT_MSG.format(message=message))


def _hidden_dropdown():
    """Return the update that empties and hides the samples dropdown."""
    return gr.Dropdown.update(choices=[], value=None, visible=False)


def _to_tensor(array):
    """Convert ``array`` to a ``const.TORCH_FLOAT`` tensor on the active device."""
    return torch.tensor(array, dtype=const.TORCH_FLOAT, device=device)


def _parse_selected_atoms(selected_atoms):
    """Parse the comma-separated atom indices produced by the JavaScript selection hook.

    ``None``, an empty string and empty tokens (e.g. a trailing comma) yield no
    indices instead of raising.
    """
    if selected_atoms is None:
        return []
    return [int(token) for token in selected_atoms.strip().split(',') if token.strip()]


def _make_sample_fn(n_atoms, with_pocket):
    """Return the callable that yields the linker size for every item of a batch.

    If ``n_atoms`` is 0 the sizes are sampled from the distribution predicted by
    ``size_nn``; otherwise every item gets the fixed size ``n_atoms``.
    """
    if n_atoms == 0:
        # Only pass ``with_pocket`` when it is set, exactly as the two original call sites did.
        extra_kwargs = {'with_pocket': True} if with_pocket else {}

        def sample_fn(_data):
            out, _ = size_nn.forward(_data, return_loss=False, **extra_kwargs)
            probabilities = torch.softmax(out, dim=1)
            distribution = torch.distributions.Categorical(probs=probabilities)
            samples = distribution.sample()
            sizes = [size_nn.linker_id2size[label] for label in samples.detach().cpu().numpy()]
            return torch.tensor(sizes, device=samples.device, dtype=torch.long)
    else:
        def sample_fn(_data):
            return torch.ones(_data['positions'].shape[0], device=device, dtype=torch.long) * n_atoms

    return sample_fn


def _build_generation_response(input_files, name, num_samples):
    """Collect generated files, zip them with the inputs and build the ``generate`` response.

    The resulting file list is ``[archive, *input_files, *generated_files]``; the
    first generated file is rendered and pre-selected in the dropdown.
    """
    generated = try_to_convert_to_sdf(name, num_samples)
    if not generated:
        # Indexing the first sample below would otherwise fail with IndexError.
        return _error_response('No molecules were generated')

    files = input_files + generated
    out_files = [compress(files, name=name)] + files
    first_sample = 1 + len(input_files)
    choice = out_files[first_sample]
    return [
        draw_sample(choice, out_files, num_samples),
        out_files,
        gr.Dropdown.update(
            choices=out_files[first_sample:],
            value=choice,
            visible=True,
        ),
        None
    ]


def read_molecule_content(path):
    """Return the full text content of the file at ``path``."""
    with open(path, "r") as f:
        return "".join(f.readlines())


def read_molecule(path):
    """Read a molecule with RDKit, choosing the reader from the file suffix.

    Supports ``.pdb``, ``.mol``, ``.mol2`` and ``.sdf`` (first record only).
    The RDKit readers may return ``None`` when the file cannot be parsed;
    callers must handle that.

    Raises:
        Exception: if the suffix is none of the supported ones.
    """
    if path.endswith('.pdb'):
        return Chem.MolFromPDBFile(path, sanitize=False, removeHs=True)
    elif path.endswith('.mol'):
        return Chem.MolFromMolFile(path, sanitize=False, removeHs=True)
    elif path.endswith('.mol2'):
        return Chem.MolFromMol2File(path, sanitize=False, removeHs=True)
    elif path.endswith('.sdf'):
        return Chem.SDMolSupplier(path, sanitize=False, removeHs=True)[0]

    raise Exception('Unknown file extension')


def read_molecule_file(in_file, allowed_extentions):
    """Read an uploaded molecule and serialize it for the viewer.

    Args:
        in_file: path string or object exposing the path as ``.name``.
        allowed_extentions: extensions (without dot) accepted for this input.

    Returns:
        ``(content, extension, None)`` on success, where ``content`` is a PDB
        block for ``pdb`` inputs and a MOL block (reported as ``'mol'``) for
        ``mol``/``mol2``/``sdf`` inputs; ``(None, None, message)`` on failure.
    """
    path = _file_path(in_file)
    extension = path.split('.')[-1]
    if extension not in allowed_extentions:
        msg = output.INVALID_FORMAT_MSG.format(extension=extension)
        return None, None, msg

    try:
        mol = read_molecule(path)
    except Exception as e:
        msg = output.ERROR_FORMAT_MSG.format(message=_clean_message(e))
        return None, None, msg

    if mol is None:
        # RDKit signals a parse failure by returning None rather than raising.
        msg = output.ERROR_FORMAT_MSG.format(message='Could not read the molecule')
        return None, None, msg

    if extension == 'pdb':
        content = Chem.MolToPDBBlock(mol)
    elif extension in ['mol', 'mol2', 'sdf']:
        content = Chem.MolToMolBlock(mol, kekulize=False)
        extension = 'mol'
    else:
        raise NotImplementedError

    return content, extension, None


def show_input(in_fragments, in_protein):
    """Render whichever of the two inputs is present and reset the samples dropdown.

    Returns:
        ``[visualization_html, dropdown_update, None]``.
    """
    vis = ''
    if in_fragments is not None and in_protein is None:
        vis = show_fragments(in_fragments)
    elif in_fragments is None and in_protein is not None:
        vis = show_target(in_protein)
    elif in_fragments is not None and in_protein is not None:
        vis = show_fragments_and_target(in_fragments, in_protein)
    return [vis, _hidden_dropdown(), None]


def show_fragments(in_fragments):
    """Return iframe HTML rendering the fragments, or the error message if reading failed."""
    molecule, extension, html = read_molecule_file(in_fragments, allowed_extentions=FRAGMENT_EXTENSIONS)
    if molecule is not None:
        html = output.FRAGMENTS_RENDERING_TEMPLATE.format(molecule=molecule, fmt=extension)
    return output.IFRAME_TEMPLATE.format(html=html)


def show_target(in_protein):
    """Return iframe HTML rendering the target protein, or the error message if reading failed."""
    molecule, extension, html = read_molecule_file(in_protein, allowed_extentions=PROTEIN_EXTENSIONS)
    if molecule is not None:
        html = output.TARGET_RENDERING_TEMPLATE.format(molecule=molecule, fmt=extension)
    return output.IFRAME_TEMPLATE.format(html=html)


def show_fragments_and_target(in_fragments, in_protein):
    """Return iframe HTML rendering fragments together with the target protein.

    If either file cannot be read, the corresponding error message is shown instead.
    """
    fragments_molecule, fragments_extension, msg = read_molecule_file(in_fragments, FRAGMENT_EXTENSIONS)
    if fragments_molecule is None:
        return output.IFRAME_TEMPLATE.format(html=msg)

    target_molecule, target_extension, msg = read_molecule_file(in_protein, allowed_extentions=PROTEIN_EXTENSIONS)
    # The original re-checked the fragments here, so a bad protein file was never reported.
    if target_molecule is None:
        return output.IFRAME_TEMPLATE.format(html=msg)

    html = output.FRAGMENTS_AND_TARGET_RENDERING_TEMPLATE.format(
        molecule=fragments_molecule,
        fmt=fragments_extension,
        target=target_molecule,
        target_fmt=target_extension,
    )
    return output.IFRAME_TEMPLATE.format(html=html)


def clear_fragments_input(in_protein):
    """Handle clearing of the fragments upload: keep showing the protein if one is loaded."""
    vis = ''
    if in_protein is not None:
        vis = show_target(in_protein)
    return [None, vis, _hidden_dropdown(), None]


def clear_protein_input(in_fragments):
    """Handle clearing of the protein upload: keep showing the fragments if they are loaded."""
    vis = ''
    if in_fragments is not None:
        vis = show_fragments(in_fragments)
    return [None, vis, _hidden_dropdown(), None]


def click_on_example(example):
    """Load an example row: resolve both file names under ``examples/`` and render them.

    An empty file name means that input is absent.
    """
    fragment_fname, target_fname = example
    fragment_path = f'examples/{fragment_fname}' if fragment_fname != '' else None
    target_path = f'examples/{target_fname}' if target_fname != '' else None
    return [fragment_path, target_path] + show_input(fragment_path, target_path)


def draw_sample(sample_path, out_files, num_samples):
    """Return iframe HTML rendering one generated sample next to the inputs.

    ``out_files`` is expected in the layout produced by the generation functions:
    ``[archive, fragments, (protein,) sample_0, ...]``. The protein is considered
    present when the list holds exactly ``num_samples + 3`` entries.
    """
    with_protein = (len(out_files) == num_samples + 3)

    in_sdf = _file_path(out_files[1])
    input_fragments_content = read_molecule_content(in_sdf)
    fragments_fmt = in_sdf.split('.')[-1]

    out_sdf = _file_path(sample_path)

    if with_protein:
        in_pdb = _file_path(out_files[2])
        input_target_content = read_molecule_content(in_pdb)
        target_fmt = in_pdb.split('.')[-1]

        generated_molecule_content = read_molecule_content(out_sdf)
        molecule_fmt = out_sdf.split('.')[-1]
        html = output.SAMPLES_WITH_TARGET_RENDERING_TEMPLATE.format(
            fragments=input_fragments_content,
            fragments_fmt=fragments_fmt,
            molecule=generated_molecule_content,
            molecule_fmt=molecule_fmt,
            target=input_target_content,
            target_fmt=target_fmt,
        )
    else:
        generated_molecule_content = read_molecule_content(out_sdf)
        molecule_fmt = out_sdf.split('.')[-1]
        html = output.SAMPLES_RENDERING_TEMPLATE.format(
            fragments=input_fragments_content,
            fragments_fmt=fragments_fmt,
            molecule=generated_molecule_content,
            molecule_fmt=molecule_fmt,
        )

    return output.IFRAME_TEMPLATE.format(html=html)


def compress(output_fnames, name):
    """Write all ``output_fnames`` into ``results/all_files_<name>.zip`` and return the archive path."""
    archive_path = f'results/all_files_{name}.zip'
    with ZipFile(archive_path, 'w') as archive:
        for fname in output_fnames:
            archive.write(fname)

    return archive_path


def generate(in_fragments, in_protein, n_steps, n_atoms, num_samples, selected_atoms):
    """Entry point of the "Generate" button: dispatch on whether a protein was uploaded.

    Returns:
        ``[visualization_html, output_files, dropdown_update, None]``; all four
        entries are ``None`` when no fragments were provided.
    """
    if in_fragments is None:
        return [None, None, None, None]

    if in_protein is None:
        return generate_without_pocket(in_fragments, n_steps, n_atoms, num_samples, selected_atoms)
    else:
        return generate_with_pocket(in_fragments, in_protein, n_steps, n_atoms, num_samples, selected_atoms)


def generate_without_pocket(input_file, n_steps, n_atoms, num_samples, selected_atoms):
    """Generate linkers for the uploaded fragments without protein conditioning.

    Args:
        input_file: fragments upload (path string or object with ``.name``).
        n_steps: number of denoising steps, assigned to ``ddpm.edm.T``.
        n_atoms: linker size; 0 lets ``size_nn`` sample it.
        num_samples: number of molecules to generate.
        selected_atoms: comma-separated anchor atom indices (may be empty).

    Returns:
        ``[visualization_html, output_files, dropdown_update, None]``; on any
        failure the first entry is an error message and the rest are ``None``.
    """
    # Parsing selected atoms (javascript output)
    selected_atoms = _parse_selected_atoms(selected_atoms)

    # Selecting model
    selected_model_name = 'geom_difflinker_given_anchors' if selected_atoms else 'geom_difflinker'
    print(f'Start generating with model {selected_model_name}, selected_atoms:', selected_atoms)
    ddpm = diffusion_models[selected_model_name]

    path = _file_path(input_file)
    extension = path.split('.')[-1]
    if extension not in FRAGMENT_EXTENSIONS:
        return _html_response(output.INVALID_FORMAT_MSG.format(extension=extension))

    try:
        molecule = read_molecule(path)
    except Exception as e:
        return _error_response(f'Could not read the molecule: {_clean_message(e)}')
    if molecule is None:
        # RDKit returns None on parse failure; without this check the atom-count
        # test below would crash with AttributeError.
        return _error_response('Could not read the molecule: the file could not be parsed')

    try:
        molecule = Chem.RemoveAllHs(molecule)
    except Exception:
        # Best effort: keep the molecule as read if hydrogens cannot be stripped.
        pass

    name = '.'.join(path.split('/')[-1].split('.')[:-1])
    inp_sdf = f'results/input_{name}.sdf'

    if molecule.GetNumAtoms() > MAX_FRAGMENT_ATOMS:
        return _error_response(f'Too large molecule: upper limit is {MAX_FRAGMENT_ATOMS} heavy atoms')

    with Chem.SDWriter(inp_sdf) as w:
        w.SetKekulize(False)
        w.write(molecule)

    positions, one_hot, charges = parse_molecule(molecule, is_geom=True)
    anchors = np.zeros_like(charges)
    anchors[selected_atoms] = 1
    fragment_mask = np.ones_like(charges)
    linker_mask = np.zeros_like(charges)
    print('Read and parsed molecule')

    # The same item is repeated so that one batch yields ``num_samples`` molecules.
    dataset = [{
        'uuid': '0',
        'name': '0',
        'positions': _to_tensor(positions),
        'one_hot': _to_tensor(one_hot),
        'charges': _to_tensor(charges),
        'anchors': _to_tensor(anchors),
        'fragment_mask': _to_tensor(fragment_mask),
        'linker_mask': _to_tensor(linker_mask),
        'num_atoms': len(positions),
    }] * num_samples
    dataloader = get_dataloader(dataset, batch_size=num_samples, collate_fn=collate_with_fragment_edges)
    print('Created dataloader')

    ddpm.edm.T = n_steps
    sample_fn = _make_sample_fn(n_atoms, with_pocket=False)

    for data in dataloader:
        try:
            generate_linkers(ddpm=ddpm, data=data, sample_fn=sample_fn, name=name, with_pocket=False)
        except Exception as e:
            return _error_response(f'Caught exception while generating linkers: {_clean_message(e)}')

    return _build_generation_response([inp_sdf], name, num_samples)


def generate_with_pocket(in_fragments, in_protein, n_steps, n_atoms, num_samples, selected_atoms):
    """Generate linkers for the uploaded fragments conditioned on a protein pocket.

    Args:
        in_fragments: fragments upload (path string or object with ``.name``).
        in_protein: target protein upload in ``.pdb`` format.
        n_steps: number of denoising steps, assigned to ``ddpm.edm.T``.
        n_atoms: linker size; 0 lets ``size_nn`` sample it.
        num_samples: number of molecules to generate.
        selected_atoms: comma-separated anchor atom indices (may be empty).

    Returns:
        ``[visualization_html, output_files, dropdown_update, None]``; on a
        handled failure the first entry is an error message and the rest are ``None``.
    """
    # Parsing selected atoms (javascript output)
    selected_atoms = _parse_selected_atoms(selected_atoms)

    # Selecting model
    selected_model_name = 'pockets_difflinker_given_anchors' if selected_atoms else 'pockets_difflinker'
    print(f'Start generating with model {selected_model_name}, selected_atoms:', selected_atoms)
    ddpm = diffusion_models[selected_model_name]

    fragments_path = _file_path(in_fragments)
    fragments_extension = fragments_path.split('.')[-1]
    if fragments_extension not in FRAGMENT_EXTENSIONS:
        return _html_response(output.INVALID_FORMAT_MSG.format(extension=fragments_extension))

    protein_path = _file_path(in_protein)
    protein_extension = protein_path.split('.')[-1]
    if protein_extension not in PROTEIN_EXTENSIONS:
        return _html_response(output.INVALID_FORMAT_MSG.format(extension=protein_extension))

    try:
        fragments_mol = read_molecule(fragments_path)
    except Exception as e:
        return _error_response(f'Could not read the molecule: {_clean_message(e)}')
    if fragments_mol is None:
        # RDKit returns None on parse failure; without this check the atom-count
        # test below would crash with AttributeError.
        return _error_response('Could not read the molecule: the file could not be parsed')

    name = '.'.join(fragments_path.split('/')[-1].split('.')[:-1])

    if fragments_mol.GetNumAtoms() > MAX_FRAGMENT_ATOMS:
        return _error_response(f'Too large molecule: upper limit is {MAX_FRAGMENT_ATOMS} heavy atoms')

    inp_sdf = f'results/input_{name}.sdf'
    with Chem.SDWriter(inp_sdf) as w:
        w.SetKekulize(False)
        w.write(fragments_mol)

    inp_pdb = f'results/target_{name}.pdb'
    shutil.copy(protein_path, inp_pdb)

    frag_pos, frag_one_hot, frag_charges = parse_molecule(fragments_mol, is_geom=True)
    pocket_pos, pocket_one_hot, pocket_charges = get_pocket(fragments_mol, protein_path)
    print(f'Detected pocket with {len(pocket_pos)} atoms')

    # Fragment atoms come first, pocket atoms second, in every concatenated array.
    positions = np.concatenate([frag_pos, pocket_pos], axis=0)
    one_hot = np.concatenate([frag_one_hot, pocket_one_hot], axis=0)
    charges = np.concatenate([frag_charges, pocket_charges], axis=0)
    anchors = np.zeros_like(charges)
    anchors[selected_atoms] = 1

    fragment_only_mask = np.concatenate([
        np.ones_like(frag_charges),
        np.zeros_like(pocket_charges),
    ])
    pocket_mask = np.concatenate([
        np.zeros_like(frag_charges),
        np.ones_like(pocket_charges),
    ])
    linker_mask = np.concatenate([
        np.zeros_like(frag_charges),
        np.zeros_like(pocket_charges),
    ])
    fragment_mask = np.concatenate([
        np.ones_like(frag_charges),
        np.ones_like(pocket_charges),
    ])
    print('Read and parsed molecule')

    dataset = [{
        'uuid': '0',
        'name': '0',
        'positions': _to_tensor(positions),
        'one_hot': _to_tensor(one_hot),
        'charges': _to_tensor(charges),
        'anchors': _to_tensor(anchors),
        'fragment_only_mask': _to_tensor(fragment_only_mask),
        'pocket_mask': _to_tensor(pocket_mask),
        'fragment_mask': _to_tensor(fragment_mask),
        'linker_mask': _to_tensor(linker_mask),
        'num_atoms': len(positions),
    }] * num_samples
    dataset = MOADDataset(data=dataset)
    ddpm.val_dataset = dataset

    batch_size = min(num_samples, MAX_BATCH_SIZE)
    dataloader = get_dataloader(dataset, batch_size=batch_size, collate_fn=collate_with_fragment_without_pocket_edges)
    print('Created dataloader')

    ddpm.edm.T = n_steps
    sample_fn = _make_sample_fn(n_atoms, with_pocket=True)

    for batch_i, data in enumerate(dataloader):
        try:
            # Index of the first sample of this batch, forwarded to generate_linkers.
            offset_idx = batch_i * batch_size
            generate_linkers(
                ddpm=ddpm, data=data, sample_fn=sample_fn, name=name, with_pocket=True, offset_idx=offset_idx,
            )
        except Exception as e:
            return _error_response(f'Caught exception while generating linkers: {_clean_message(e)}')

    return _build_generation_response([inp_sdf, inp_pdb], name, num_samples)


demo = gr.Blocks()
with demo:
    gr.Markdown('# DiffLinker: Equivariant 3D-Conditional Diffusion Model for Molecular Linker Design')
    gr.Markdown(
        'Given a set of disconnected fragments in 3D, '
        'DiffLinker places missing atoms in between and designs a molecule incorporating all the initial fragments. '
        'Our method can link an arbitrary number of fragments, requires no information on the attachment atoms '
        'and linker size, and can be conditioned on the protein pockets.'
    )
    gr.Markdown(
        '[**[Paper]**](https://arxiv.org/abs/2210.05274) '
        '[**[Code]**](https://github.com/igashov/DiffLinker)'
    )
    with gr.Box():
        with gr.Row():
            with gr.Column():
                gr.Markdown('## Input')
                gr.Markdown('Upload the file with 3D-coordinates of the input fragments in .pdb, .mol2 or .sdf format:')
                input_fragments_file = gr.File(file_count='single', label='Input Fragments')
                gr.Markdown('Upload the file of the target protein in .pdb format (optionally):')
                input_protein_file = gr.File(file_count='single', label='Target Protein (Optional)')
                n_steps = gr.Slider(
                    minimum=MIN_N_STEPS, maximum=MAX_N_STEPS,
                    label="Number of Denoising Steps", step=10
                )
                n_atoms = gr.Slider(
                    minimum=0, maximum=20,
                    label="Linker Size: DiffLinker will predict it if set to 0",
                    step=1
                )
                n_samples = gr.Slider(minimum=5, maximum=50, label="Number of Samples", step=5)
                examples = gr.Dataset(
                    components=[gr.File(visible=False), gr.File(visible=False)],
                    samples=[
                        ['examples/example_1.sdf', ''],
                        ['examples/example_2.sdf', ''],
                        ['examples/3hz1_fragments.sdf', 'examples/3hz1_protein.pdb'],
                        ['examples/5ou2_fragments.sdf', 'examples/5ou2_protein.pdb'],
                    ],
                    type='values',
                    headers=['Input Fragments', 'Target Protein'],
                )

                button = gr.Button('Generate Linker!')
                gr.Markdown('')
                gr.Markdown('## Output Files')
                gr.Markdown('Download files with the generated molecules here:')
                output_files = gr.File(file_count='multiple', label='Output Files', interactive=False)
                # Invisible textbox carrying the atom selection returned by the JavaScript hook.
                hidden = gr.Textbox(visible=False)
            with gr.Column():
                gr.Markdown('## Visualization')
                gr.Markdown('**Hint:** click on atoms to select anchor points (optionally)')
                samples = gr.Dropdown(
                    choices=[],
                    value=None,
                    type='value',
                    multiselect=False,
                    visible=False,
                    interactive=True,
                    label='Samples'
                )
                visualization = gr.HTML()

    input_fragments_file.change(
        fn=show_input,
        inputs=[input_fragments_file, input_protein_file],
        outputs=[visualization, samples, hidden],
    )
    input_protein_file.change(
        fn=show_input,
        inputs=[input_fragments_file, input_protein_file],
        outputs=[visualization, samples, hidden],
    )
    input_fragments_file.clear(
        fn=clear_fragments_input,
        inputs=[input_protein_file],
        outputs=[input_fragments_file, visualization, samples, hidden],
    )
    input_protein_file.clear(
        fn=clear_protein_input,
        inputs=[input_fragments_file],
        outputs=[input_protein_file, visualization, samples, hidden],
    )
    examples.click(
        fn=click_on_example,
        inputs=[examples],
        outputs=[input_fragments_file, input_protein_file, visualization, samples, hidden]
    )
    button.click(
        fn=generate,
        inputs=[input_fragments_file, input_protein_file, n_steps, n_atoms, n_samples, hidden],
        outputs=[visualization, output_files, samples, hidden],
        _js=output.RETURN_SELECTION_JS,
    )
    samples.select(
        fn=draw_sample,
        inputs=[samples, output_files, n_samples],
        outputs=[visualization],
    )
    demo.load(_js=output.STARTUP_JS)

demo.launch(server_name=args.ip)