Spaces:

Zaixi
/

ICLR_FLAG

Runtime error

App Files Files Community

zaixizhang committed on Oct 24, 2023

Commit

10efe81

1 Parent(s): 465c18c

renew

Browse files

Files changed (38) hide show

.gitattributes +1 -0
app.py +48 -4
checkpoints/pretrained.pt +3 -0
configs/sample.yml +18 -0
data/index.pt +3 -0
data/pdbbind_pocket10_name2id.pt +3 -0
data/pdbbind_pocket10_processed.lmdb +3 -0
data/pdbbind_pocket10_processed.lmdb-lock +0 -0
data/split_by_name.pt +3 -0
evaluation/prepare_receptor4.py +183 -0
evaluation/vina_score.py +35 -0
models/common.py +282 -0
models/encoders/__init__.py +26 -0
models/encoders/gnn.py +441 -0
models/encoders/schnet.py +105 -0
models/encoders/tf.py +152 -0
models/flag.py +268 -0
motif_sample.py +660 -0
requirements.txt +390 -0
utils/__init__.py +3 -0
utils/chem.py +119 -0
utils/chemutils.py +597 -0
utils/data.py +127 -0
utils/datasets/__init__.py +21 -0
utils/datasets/pl.py +176 -0
utils/dihedral_utils.py +383 -0
utils/docking.py +183 -0
utils/fpscores.pkl.gz +3 -0
utils/misc.py +78 -0
utils/mol_tree.py +220 -0
utils/protein_ligand.py +283 -0
utils/reconstruct.py +498 -0
utils/sascorer.py +163 -0
utils/similarity.py +20 -0
utils/train.py +102 -0
utils/transforms.py +684 -0
utils/warmup.py +86 -0
vocab.txt +549 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.lmdb filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,7 +1,51 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

 import gradio as gr
+import os
+from rdkit import Chem
+from motif_sample import demo
+import py3Dmol
+import tempfile
+from rdkit.Chem import AllChem
+import numpy as np
+from PIL import Image
+import io
+from rdkit.Chem import Draw
+# Function to serve the file via Gradio
+def create_and_return_sdf(Protein_index: int):
+    """Sample a ligand for one pocket and return its 2D picture plus the SDF path.
+
+    Args:
+        Protein_index: Index forwarded to ``motif_sample.demo``. The interface
+            declares a ``"text"`` input, so the value is converted with
+            ``int()`` before use.
+
+    Returns:
+        A ``(np_image, sdf_filename)`` tuple: the drawing as an array with only
+        its first three channels kept, and the path returned by ``demo``.
+
+    Raises:
+        ValueError: If the index is not an integer, or if the generated SDF
+            holds no molecule that RDKit can read and redraw.
+    """
+    number = int(Protein_index)
+    # Generate SDF file (you'll replace this with your actual logic)
+    sdf_filename = demo(number)
+    suppl = Chem.SDMolSupplier(sdf_filename)
+    # The supplier yields None for a record it cannot parse and is exhausted
+    # immediately for an empty file; report both cases explicitly instead of
+    # failing later inside RDKit or with a bare StopIteration.
+    mol = next(suppl, None)
+    if mol is None:
+        raise ValueError(f"No readable molecule in generated SDF file: {sdf_filename}")
+    # The picture is drawn from a molecule rebuilt from its SMILES string.
+    mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
+    if mol is None:
+        raise ValueError(f"Generated molecule could not be rebuilt from SMILES: {sdf_filename}")
+    for atom in mol.GetAtoms():
+        atom.SetAtomMapNum(0)
+    mol_image = Draw.MolToImage(mol)
+    # Keep only the first three channels so an alpha channel, if any, is dropped.
+    np_image = np.array(mol_image)[:, :, :3]
+    return np_image, sdf_filename
+# Define Gradio interface
+# The output components are taken from the top-level ``gr`` namespace: the
+# legacy ``gr.outputs.*`` classes are deprecated and absent from current
+# Gradio releases, so referencing them fails at import time there.
+iface = gr.Interface(
+    fn=create_and_return_sdf,
+    inputs="text",
+    outputs=[
+        gr.Image(type="numpy", label="Molecule Image"),
+        gr.File(label="Download SDF"),
+    ],
+    live=False  # The function should only be called when the user submits the form
+)
+# Launch the interface
+iface.launch(share=True)

checkpoints/pretrained.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a61f9bee0a6ce3101d8df5b55d71831d8428c4d6ff82ab81ada8bb5277babd42
+size 44147405

configs/sample.yml ADDED Viewed

	@@ -0,0 +1,18 @@

+dataset:
+  name: pl
+  path: ./data
+  split: ./data/split_by_name.pt
+model:
+  checkpoint: ./checkpoints/pretrained.pt
+  hidden_channels: 256
+  random_alpha: False
+sample:
+  seed: 2024
+  num_samples: 100
+  num_retry: 5
+  max_steps: 12
+  batch_size: 10
+  num_workers: 4
+  n_samples: 5

data/index.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c095461a584f03838af99d6411483040f641659d1521dc5247bcefd72e36ca8e
+size 226859

data/pdbbind_pocket10_name2id.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0aa8a46e2cd77abb5b59221ece57e607ba30427d964d9d5b08c0d02e3449399c
+size 237265

data/pdbbind_pocket10_processed.lmdb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a21c6a2231e3394d3c8e27b1e547a9376f4ee0bdc1c246fbbf32a8fc076710eb
+size 386912256

data/pdbbind_pocket10_processed.lmdb-lock ADDED Viewed

Binary file (8.19 kB). View file

data/split_by_name.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d406b7bd5189c35cbe73943b5c8e0d61ae1fdc79b40d24dbb45c474915892b56
+size 227451

evaluation/prepare_receptor4.py ADDED Viewed

	@@ -0,0 +1,183 @@

+#!/usr/bin/env python
+#
+#
+#
+# $Header: /opt/cvs/python/packages/share1.5/AutoDockTools/Utilities24/prepare_receptor4.py,v 1.11 2007/11/28 22:40:22 rhuey Exp $
+#
+import os
+from MolKit import Read
+import MolKit.molecule
+import MolKit.protein
+from AutoDockTools.MoleculePreparation import AD4ReceptorPreparation
+if __name__ == '__main__':
+    import sys
+    import getopt
+    def usage():
+        "Print helpful, accurate usage statement to stdout."
+        # NOTE(review): the getopt spec used by this script is
+        # 'r:vo:A:Cp:U:eM:' -- it declares -M as taking a value (the mode)
+        # although the text below lists it as a bare flag, and it does not
+        # declare -h at all. Confirm which of the two is intended.
+        print "Usage: prepare_receptor4.py -r filename"
+        print
+        print "    Description of command..."
+        print "         -r   receptor_filename "
+        print "        supported file types include pdb,mol2,pdbq,pdbqs,pdbqt, possibly pqr,cif"
+        print "    Optional parameters:"
+        print "        [-v]  verbose output (default is minimal output)"
+        print "        [-o pdbqt_filename]  (default is 'molecule_name.pdbqt')"
+        print "        [-A]  type(s) of repairs to make: "
+        print "             'bonds_hydrogens': build bonds and add hydrogens "
+        print "             'bonds': build a single bond from each atom with no bonds to its closest neighbor"
+        print "             'hydrogens': add hydrogens"
+        print "             'checkhydrogens': add hydrogens only if there are none already"
+        print "             'None': do not make any repairs "
+        print "             (default is 'checkhydrogens')"
+        print "        [-C]  preserve all input charges ie do not add new charges "
+        print "             (default is addition of gasteiger charges)"
+        print "        [-p]  preserve input charges on specific atom types, eg -p Zn -p Fe"
+        print "        [-U]  cleanup type:"
+        print "             'nphs': merge charges and remove non-polar hydrogens"
+        print "             'lps': merge charges and remove lone pairs"
+        print "             'waters': remove water residues"
+        print "             'nonstdres': remove chains composed entirely of residues of"
+        print "                      types other than the standard 20 amino acids"
+        print "             'deleteAltB': remove XX@B atoms and rename XX@A atoms->XX"
+        print "             (default is 'nphs_lps_waters_nonstdres') "
+        print "        [-e]  delete every nonstd residue from any chain"
+        print "              'True': any residue whose name is not in this list:"
+        print "                      ['CYS','ILE','SER','VAL','GLN','LYS','ASN', "
+        print "                      'PRO','THR','PHE','ALA','HIS','GLY','ASP', "
+        print "                      'LEU', 'ARG', 'TRP', 'GLU', 'TYR','MET', "
+        print "                      'HID', 'HSP', 'HIE', 'HIP', 'CYX', 'CSS']"
+        print "              will be deleted from any chain. "
+        print "              NB: there are no  nucleic acid residue names at all "
+        print "              in the list and no metals. "
+        print "             (default is False which means not to do this)"
+        print "        [-M]  interactive "
+        print "             (default is 'automatic': outputfile is written with no further user input)"
+    # process command arguments
+    # Options followed by ':' in the spec take a value: -r, -o, -A, -p, -U, -M.
+    try:
+        opt_list, args = getopt.getopt(sys.argv[1:], 'r:vo:A:Cp:U:eM:')
+    except getopt.GetoptError, msg:
+        print 'prepare_receptor4.py: %s' %msg
+        usage()
+        sys.exit(2)
+    # initialize required parameters
+    #-r: receptor
+    receptor_filename =  None
+    # optional parameters
+    verbose = None
+    #-A: repairs to make: add bonds and/or hydrogens or checkhydrogens
+    repairs = ''
+    #-C default: add gasteiger charges
+    charges_to_add = 'gasteiger'
+    #-p preserve charges on specific atom types
+    preserve_charge_types=None
+    #-U: cleanup by merging nphs_lps, nphs, lps, waters, nonstdres
+    cleanup  = "nphs_lps_waters_nonstdres"
+    #-o outputfilename
+    outputfilename = None
+    #-M mode
+    mode = 'automatic'
+    #-e delete every nonstd residue from each chain
+    delete_single_nonstd_residues = None
+    # Options are applied in command-line order, so the "if verbose" echoes
+    # below only fire for options that come after -v.
+    for o, a in opt_list:
+        if o in ('-r', '--r'):
+            receptor_filename = a
+            if verbose: print 'set receptor_filename to ', a
+        if o in ('-v', '--v'):
+            verbose = True
+            if verbose: print 'set verbose to ', True
+        if o in ('-o', '--o'):
+            outputfilename = a
+            if verbose: print 'set outputfilename to ', a
+        if o in ('-A', '--A'):
+            repairs = a
+            if verbose: print 'set repairs to ', a
+        if o in ('-C', '--C'):
+            # None means "do not compute new charges".
+            charges_to_add = None
+            if verbose: print 'do not add charges'
+        if o in ('-p', '--p'):
+            # Repeated -p options accumulate into one comma-separated string.
+            if not preserve_charge_types:
+                preserve_charge_types = a
+            else:
+                preserve_charge_types = preserve_charge_types + ','+ a
+            if verbose: print 'preserve initial charges on ', preserve_charge_types
+        if o in ('-U', '--U'):
+            cleanup  = a
+            if verbose: print 'set cleanup to ', a
+        if o in ('-e', '--e'):
+            delete_single_nonstd_residues  = True
+            if verbose: print 'set delete_single_nonstd_residues to True'
+        if o in ('-M', '--M'):
+            mode = a
+            if verbose: print 'set mode to ', a
+        # NOTE(review): 'h' is not in the getopt spec above, so -h is rejected
+        # by getopt before this branch can run -- confirm and add it if wanted.
+        if o in ('-h', '--'):
+            usage()
+            sys.exit()
+    # The receptor file is the only mandatory argument.
+    if not receptor_filename:
+        print 'prepare_receptor4: receptor filename must be specified.'
+        usage()
+        sys.exit()
+    #what about nucleic acids???
+    # Read every molecule in the file and start from the first one.
+    mols = Read(receptor_filename)
+    if verbose: print 'read ', receptor_filename
+    mol = mols[0]
+    # preserved maps atom -> [chargeSet, charge] for atoms whose input charges
+    # must survive the charge calculation (only relevant when charges are added).
+    # NOTE(review): the atoms are collected from mols[0] here, before the
+    # largest molecule is selected further down; if a later molecule wins,
+    # the preserved atoms belong to a different molecule -- confirm intent.
+    preserved = {}
+    if charges_to_add is not None and preserve_charge_types is not None:
+        preserved_types = preserve_charge_types.split(',')
+        if verbose: print "preserved_types=", preserved_types
+        for t in preserved_types:
+            if verbose: print 'preserving charges on type->', t
+            # Skip empty entries produced by stray commas.
+            if not len(t): continue
+            ats = mol.allAtoms.get(lambda x: x.autodock_element==t)
+            if verbose: print "preserving charges on ", ats.name
+            for a in ats:
+                if a.chargeSet is not None:
+                    preserved[a] = [a.chargeSet, a.charge]
+    if len(mols)>1:
+        if verbose: print "more than one molecule in file"
+        #use the molecule with the most atoms
+        ctr = 1
+        for m in mols[1:]:
+            ctr += 1
+            if len(m.allAtoms)>len(mol.allAtoms):
+                mol = m
+                if verbose: print "mol set to ", ctr, "th molecule with", len(mol.allAtoms), "atoms"
+    mol.buildBondsByDistance()
+    if verbose:
+        print "setting up RPO with mode=", mode,
+        print "and outputfilename= ", outputfilename
+        print "charges_to_add=", charges_to_add
+        print "delete_single_nonstd_residues=", delete_single_nonstd_residues
+    # Hand the molecule and all options to AutoDockTools; presumably this is
+    # also what writes the output file in 'automatic' mode -- TODO confirm.
+    RPO = AD4ReceptorPreparation(mol, mode, repairs, charges_to_add,
+                        cleanup, outputfilename=outputfilename,
+                        preserved=preserved,
+                        delete_single_nonstd_residues=delete_single_nonstd_residues)
+    if charges_to_add is not None:
+        #restore any previous charges
+        for atom, chargeList in preserved.items():
+            atom._charges[chargeList[0]] = chargeList[1]
+            atom.chargeSet = chargeList[0]
+# To execute this command type:
+# prepare_receptor4.py -r pdb_file -o outputfilename -A checkhydrogens

evaluation/vina_score.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from vina import Vina
+from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule
+from rdkit import Chem
+import numpy as np
+import os
+# For every ./<n>.sdf (n = 0..99) present in the working directory: score,
+# locally minimise and dock with AutoDock Vina inside a 20 A box centred on
+# the UFF-optimised molecule read from that SDF.
+# NOTE(review): each pass loads the same '2rma_ligand.pdbqt' ligand and writes
+# the same two output files, so later passes overwrite earlier results and the
+# SDF only determines the box centre -- confirm this is intended.
+for idx in range(100):
+    path = f'./{idx}.sdf'
+    if not os.path.exists(path):
+        continue
+    print(path)
+    v = Vina(sf_name='vina')
+    v.set_receptor('2rma_protein.pdbqt')
+    v.set_ligand_from_file('2rma_ligand.pdbqt')
+    # Calculate the docking center
+    mol = Chem.AddHs(Chem.MolFromMolFile(path, sanitize=True), addCoords=True)
+    UFFOptimizeMolecule(mol)
+    coords = mol.GetConformer(0).GetPositions()
+    center = np.mean(coords, 0)
+    v.compute_vina_maps(center=center, box_size=[20, 20, 20])
+    # Score the current pose
+    energy = v.score()
+    print('Score before minimization: %.3f (kcal/mol)' % energy[0])
+    # Minimized locally the current pose
+    energy_minimized = v.optimize()
+    print('Score after minimization : %.3f (kcal/mol)' % energy_minimized[0])
+    v.write_pose('ligand_minimized.pdbqt', overwrite=True)
+    # Dock the ligand
+    v.dock(exhaustiveness=64, n_poses=30)
+    v.write_poses('out.pdbqt', n_poses=5, overwrite=True)

models/common.py ADDED Viewed

	@@ -0,0 +1,282 @@

+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.loss import _WeightedLoss
+from torch_scatter import scatter_mean, scatter_add
+def split_tensor_by_batch(x, batch, num_graphs=None):
+    """
+    Split the rows of ``x`` into one tensor per graph.
+    Args:
+        x:          (N, ...)
+        batch:      (N, ), graph index of every row of ``x``
+        num_graphs: number of graphs B; inferred as ``batch.max() + 1`` when None
+    Returns:
+        [(N_1, ...), (N_2, ...) ..., (N_B, ...)]
+    """
+    if num_graphs is None:
+        # An empty batch has no maximum; treat it as zero graphs rather than
+        # letting ``max()`` raise.
+        num_graphs = batch.max().item() + 1 if batch.numel() > 0 else 0
+    return [x[batch == i] for i in range(num_graphs)]
+def concat_tensors_to_batch(x_split):
+    """
+    Concatenate per-graph tensors along dim 0 and build the matching batch index.
+    Args:
+        x_split:  list of tensors, one per graph
+    Returns:
+        x:      concatenation of all tensors
+        batch:  position of the source tensor in ``x_split`` for every row of ``x``
+    """
+    x = torch.cat(x_split, dim=0)
+    rows_per_graph = torch.LongTensor([part.size(0) for part in x_split])
+    graph_ids = torch.arange(len(x_split))
+    batch = torch.repeat_interleave(graph_ids, repeats=rows_per_graph)
+    return x, batch.to(device=x.device)
+def split_tensor_to_segments(x, segsize):
+    """Cut ``x`` along dim 0 into consecutive slices of at most ``segsize`` rows."""
+    num_segs = math.ceil(x.size(0) / segsize)
+    return [x[k * segsize : (k + 1) * segsize] for k in range(num_segs)]
+def split_tensor_by_lengths(x, lengths):
+    """Peel consecutive slices off the front of ``x``, one per entry of ``lengths``."""
+    pieces = []
+    remaining = x
+    for n in lengths:
+        head, remaining = remaining[:n], remaining[n:]
+        pieces.append(head)
+    return pieces
+def batch_intersection_mask(batch, batch_filter):
+    """Return a boolean mask marking the entries of ``batch`` whose value occurs in ``batch_filter``."""
+    wanted = batch_filter.unique().view(1, -1)
+    # Compare every entry against every wanted value, then reduce per entry.
+    hits = batch.view(-1, 1) == wanted
+    return hits.any(dim=1)
+class MeanReadout(nn.Module):
+    """Mean readout operator over graphs with variadic sizes."""
+    def forward(self, input, batch, num_graphs):
+        """
+        Average the node representations of each graph.
+        Parameters:
+            input (Tensor): node representations
+            batch (Tensor): index passed to ``scatter_mean`` to group rows of ``input``
+            num_graphs (int): size of the output along dim 0
+        Returns:
+            Tensor: graph representations
+        """
+        return scatter_mean(input, batch, dim=0, dim_size=num_graphs)
+class SumReadout(nn.Module):
+    """Sum readout operator over graphs with variadic sizes."""
+    def forward(self, input, batch, num_graphs):
+        """
+        Sum the node representations of each graph.
+        Parameters:
+            input (Tensor): node representations
+            batch (Tensor): index passed to ``scatter_add`` to group rows of ``input``
+            num_graphs (int): size of the output along dim 0
+        Returns:
+            Tensor: graph representations
+        """
+        return scatter_add(input, batch, dim=0, dim_size=num_graphs)
+class MultiLayerPerceptron(nn.Module):
+    """
+    Multi-layer Perceptron.
+    Note there is no activation or dropout in the last layer.
+    Parameters:
+        input_dim (int): input dimension
+        hidden_dims (list or tuple of int): output dimension of every layer
+        activation (str or function, optional): name of a function in
+            ``torch.nn.functional`` or a callable; ``None`` disables activation
+        dropout (float, optional): dropout rate
+    """
+    def __init__(self, input_dim, hidden_dims, activation="relu", dropout=0):
+        super(MultiLayerPerceptron, self).__init__()
+        # list() lets callers pass a tuple as well as a list.
+        self.dims = [input_dim] + list(hidden_dims)
+        if isinstance(activation, str):
+            self.activation = getattr(F, activation)
+        else:
+            # Keep a caller-supplied callable (or None) as is; it used to be
+            # replaced by None, which silently disabled the activation.
+            self.activation = activation
+        if dropout:
+            self.dropout = nn.Dropout(dropout)
+        else:
+            self.dropout = None
+        self.layers = nn.ModuleList()
+        for i in range(len(self.dims) - 1):
+            self.layers.append(nn.Linear(self.dims[i], self.dims[i + 1]))
+    def forward(self, input):
+        """Apply every linear layer, with activation and dropout after all but the last."""
+        x = input
+        last = len(self.layers) - 1
+        for i, layer in enumerate(self.layers):
+            x = layer(x)
+            if i < last:
+                if self.activation:
+                    x = self.activation(x)
+                if self.dropout:
+                    x = self.dropout(x)
+        return x
+class SmoothCrossEntropyLoss(_WeightedLoss):
+    """Cross entropy against label-smoothed one-hot targets, with optional per-class weights."""
+    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
+        super().__init__(weight=weight, reduction=reduction)
+        self.smoothing = smoothing
+        self.weight = weight
+        self.reduction = reduction
+    @staticmethod
+    def _smooth_one_hot(targets:torch.Tensor, n_classes:int, smoothing=0.0):
+        """Build an (N, n_classes) target matrix: ``1 - smoothing`` on the label, the rest spread evenly."""
+        assert 0 <= smoothing < 1
+        with torch.no_grad():
+            off_value = smoothing / (n_classes - 1)
+            smoothed = torch.full((targets.size(0), n_classes), off_value,
+                                  device=targets.device)
+            smoothed.scatter_(1, targets.data.unsqueeze(1), 1. - smoothing)
+        return smoothed
+    def forward(self, inputs, targets):
+        """Return the loss for logits ``inputs`` and integer class ``targets``."""
+        soft_targets = SmoothCrossEntropyLoss._smooth_one_hot(
+            targets, inputs.size(-1), self.smoothing)
+        log_probs = F.log_softmax(inputs, -1)
+        if self.weight is not None:
+            log_probs = log_probs * self.weight.unsqueeze(0)
+        per_sample = -(soft_targets * log_probs).sum(-1)
+        if self.reduction == 'sum':
+            return per_sample.sum()
+        if self.reduction == 'mean':
+            return per_sample.mean()
+        return per_sample
+class GaussianSmearing(nn.Module):
+    """Expand distances on ``num_gaussians`` Gaussians with evenly spaced centres in [start, stop]."""
+    def __init__(self, start=0.0, stop=10.0, num_gaussians=50):
+        super().__init__()
+        centres = torch.linspace(start, stop, num_gaussians)
+        spacing = (centres[1] - centres[0]).item()
+        # The spacing between neighbouring centres is used as the width.
+        self.coeff = -0.5 / spacing**2
+        self.register_buffer('offset', centres)
+    def forward(self, dist):
+        """Flatten ``dist`` and return a (numel, num_gaussians) tensor of Gaussian responses."""
+        delta = dist.view(-1, 1) - self.offset.view(1, -1)
+        return torch.exp(self.coeff * torch.pow(delta, 2))
+class ShiftedSoftplus(nn.Module):
+    """Softplus shifted down by log(2), so that an input of 0 maps to 0."""
+    def __init__(self):
+        super().__init__()
+        two = torch.tensor(2.0)
+        self.shift = torch.log(two).item()
+    def forward(self, x):
+        """Return ``softplus(x) - log(2)`` element-wise."""
+        activated = F.softplus(x)
+        return activated - self.shift
+def compose_context(h_protein, h_ligand, pos_protein, pos_ligand, batch_protein, batch_ligand):
+    """
+    Merge protein and ligand nodes into one context, ordered by graph index.
+    Args:
+        h_protein, h_ligand:          node features, concatenated along dim 0
+        pos_protein, pos_ligand:      node positions, concatenated along dim 0
+        batch_protein, batch_ligand:  graph index of every protein / ligand node
+    Returns:
+        h_ctx, pos_ctx, batch_ctx: the concatenated tensors re-ordered so that
+        ``batch_ctx`` is sorted.
+    Note:
+        ``argsort`` is called without a stability guarantee, so the order of
+        nodes inside one graph is unspecified.
+    """
+    # The protein/ligand mask that used to be built here was never used or
+    # returned, so it is no longer computed.
+    batch_ctx = torch.cat([batch_protein, batch_ligand], dim=0)
+    sort_idx = batch_ctx.argsort()
+    batch_ctx = batch_ctx[sort_idx]
+    h_ctx = torch.cat([h_protein, h_ligand], dim=0)[sort_idx]       # (N_protein+N_ligand, H)
+    pos_ctx = torch.cat([pos_protein, pos_ligand], dim=0)[sort_idx] # (N_protein+N_ligand, 3)
+    return h_ctx, pos_ctx, batch_ctx
+def get_complete_graph(batch):
+    """
+    Build the edges of a complete directed graph (no self-loops) inside every graph of a batch.
+    Args:
+        batch:  Batch index.
+    Returns:
+        edge_index: (2, sum_i N_i * (N_i - 1)), where N_i is the number of nodes of the i-th graph.
+        num_edges:  (B, ), number of edges per graph.
+    """
+    # Node count per graph, and the number of ordered node pairs (including i == j).
+    natoms = scatter_add(torch.ones_like(batch), index=batch, dim=0)
+    natoms_sqr = (natoms ** 2).long()
+    num_atom_pairs = torch.sum(natoms_sqr)
+    natoms_expand = torch.repeat_interleave(natoms, natoms_sqr)
+    # Index of the first node of each graph, repeated once per pair of that graph.
+    index_offset = torch.cumsum(natoms, dim=0) - natoms
+    index_offset_expand = torch.repeat_interleave(index_offset, natoms_sqr)
+    # Position of each pair inside its own graph's N_i * N_i block.
+    index_sqr_offset = torch.cumsum(natoms_sqr, dim=0) - natoms_sqr
+    index_sqr_offset = torch.repeat_interleave(index_sqr_offset, natoms_sqr)
+    atom_count_sqr = torch.arange(num_atom_pairs, device=num_atom_pairs.device) - index_sqr_offset
+    # Decode the in-block position into (row, column) and shift to global node ids.
+    index1 = (atom_count_sqr // natoms_expand).long() + index_offset_expand
+    index2 = (atom_count_sqr % natoms_expand).long() + index_offset_expand
+    edge_index = torch.cat([index1.view(1, -1), index2.view(1, -1)])
+    # Drop the i == j pairs.
+    mask = torch.logical_not(index1 == index2)
+    edge_index = edge_index[:, mask]
+    num_edges = natoms_sqr - natoms # Number of edges per graph
+    return edge_index, num_edges
+def compose_context_stable(h_protein, h_ligand, pos_protein, pos_ligand, batch_protein, batch_ligand):
+    """
+    Merge protein and ligand nodes graph by graph, protein nodes first within every graph.
+    The number of graphs is taken from ``batch_protein.max() + 1``.
+    Returns:
+        h_ctx, pos_ctx, batch_ctx: concatenated features, positions and graph indices
+        mask_protein: boolean tensor, True where the context row is a protein node
+    """
+    num_graphs = batch_protein.max().item() + 1
+    h_parts, pos_parts, batch_parts, mask_parts = [], [], [], []
+    for g in range(num_graphs):
+        sel_p = batch_protein == g
+        sel_l = batch_ligand == g
+        ids_p = batch_protein[sel_p]
+        ids_l = batch_ligand[sel_l]
+        batch_parts.extend((ids_p, ids_l))
+        h_parts.extend((h_protein[sel_p], h_ligand[sel_l]))
+        pos_parts.extend((pos_protein[sel_p], pos_ligand[sel_l]))
+        mask_parts.append(torch.ones([ids_p.size(0)], device=ids_p.device, dtype=torch.bool))
+        mask_parts.append(torch.zeros([ids_l.size(0)], device=ids_l.device, dtype=torch.bool))
+    h_ctx = torch.cat(h_parts, dim=0)
+    pos_ctx = torch.cat(pos_parts, dim=0)
+    batch_ctx = torch.cat(batch_parts, dim=0)
+    mask_protein = torch.cat(mask_parts, dim=0)
+    return h_ctx, pos_ctx, batch_ctx, mask_protein
+if __name__ == '__main__':
+    # Self-check for compose_context_stable on three graphs with 10/20/30
+    # protein nodes and 11 ligand nodes each.
+    h_protein = torch.randn([60, 64])
+    h_ligand = -torch.randn([33, 64])
+    # Protein positions are clamped to be non-negative and ligand positions
+    # to be non-positive.
+    pos_protein = torch.clamp(torch.randn([60, 3]), 0, float('inf'))
+    pos_ligand = torch.clamp(torch.randn([33, 3]), float('-inf'), 0)
+    batch_protein = torch.LongTensor([0]*10 + [1]*20 + [2]*30)
+    batch_ligand = torch.LongTensor([0]*11 + [1]*11 + [2]*11)
+    h_ctx, pos_ctx, batch_ctx, mask_protein = compose_context_stable(h_protein, h_ligand, pos_protein, pos_ligand, batch_protein, batch_ligand)
+    # Selecting with the mask (or its negation) must give back each input unchanged.
+    assert (batch_ctx[mask_protein] == batch_protein).all()
+    assert (batch_ctx[torch.logical_not(mask_protein)] == batch_ligand).all()
+    assert torch.allclose(h_ctx[torch.logical_not(mask_protein)], h_ligand)
+    assert torch.allclose(h_ctx[mask_protein], h_protein)
+    assert torch.allclose(pos_ctx[torch.logical_not(mask_protein)], pos_ligand)
+    assert torch.allclose(pos_ctx[mask_protein], pos_protein)

models/encoders/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from .schnet import SchNetEncoder
+from .tf import TransformerEncoder
+from .gnn import GNN_graphpred, MLP
+def get_encoder(config):
+    """
+    Build the encoder selected by ``config.name``.
+    Args:
+        config: object exposing ``name`` ('schnet' or 'tf') and the
+            hyper-parameters read below for that encoder.
+    Raises:
+        NotImplementedError: for any other ``config.name``.
+    """
+    encoder_name = config.name
+    if encoder_name == 'schnet':
+        return SchNetEncoder(
+            hidden_channels=config.hidden_channels,
+            num_filters=config.num_filters,
+            num_interactions=config.num_interactions,
+            edge_channels=config.edge_channels,
+            cutoff=config.cutoff,
+        )
+    if encoder_name == 'tf':
+        return TransformerEncoder(
+            hidden_channels=config.hidden_channels,
+            edge_channels=config.edge_channels,
+            key_channels=config.key_channels,
+            num_heads=config.num_heads,
+            num_interactions=config.num_interactions,
+            k=config.knn,
+            cutoff=config.cutoff,
+        )
+    raise NotImplementedError('Unknown encoder: %s' % config.name)

models/encoders/gnn.py ADDED Viewed

	@@ -0,0 +1,441 @@

+import torch
+import torch.nn as nn
+from torch_geometric.nn import MessagePassing
+from torch_geometric.utils import add_self_loops, degree, softmax
+from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
+import torch.nn.functional as F
+from torch_scatter import scatter_add
+from torch_geometric.nn.inits import glorot, zeros
+num_atom_type = 120 #including the extra mask tokens
+num_chirality_tag = 3
+num_bond_type = 6 #including aromatic and self-loop edge, and extra masked tokens
+num_bond_direction = 3
+class GINConv(MessagePassing):
+    """
+    Extension of GIN aggregation that adds an edge embedding to every neighbour message.
+    Args:
+        emb_dim (int): dimensionality of embeddings for nodes and edges.
+        aggr (str): aggregation scheme handed to ``MessagePassing``.
+    See https://arxiv.org/abs/1810.00826
+    """
+    def __init__(self, emb_dim, aggr = "add"):
+        # The scheme must reach MessagePassing.__init__: recent torch_geometric
+        # releases resolve the aggregation there, so assigning ``self.aggr``
+        # afterwards does not change how messages are reduced.
+        super(GINConv, self).__init__(aggr=aggr)
+        #multi-layer perceptron
+        self.mlp = torch.nn.Sequential(torch.nn.Linear(emb_dim, 2*emb_dim), torch.nn.ReLU(), torch.nn.Linear(2*emb_dim, emb_dim))
+        self.edge_embedding1 = torch.nn.Embedding(num_bond_type, emb_dim)
+        self.edge_embedding2 = torch.nn.Embedding(num_bond_direction, emb_dim)
+        torch.nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
+        torch.nn.init.xavier_uniform_(self.edge_embedding2.weight.data)
+    def forward(self, x, edge_index, edge_attr):
+        """Run one GIN layer on node features ``x`` with two-column integer ``edge_attr``."""
+        #add self loops in the edge space (returns a tuple; element 0 is the edge index)
+        edge_index = add_self_loops(edge_index, num_nodes = x.size(0))
+        #add features corresponding to self-loop edges, built directly on the
+        #device and dtype of edge_attr
+        self_loop_attr = torch.zeros(x.size(0), 2, device=edge_attr.device, dtype=edge_attr.dtype)
+        self_loop_attr[:,0] = 4 #bond type for self-loop edge
+        edge_attr = torch.cat((edge_attr, self_loop_attr), dim = 0)
+        edge_embeddings = self.edge_embedding1(edge_attr[:,0]) + self.edge_embedding2(edge_attr[:,1])
+        return self.propagate(edge_index[0], x=x, edge_attr=edge_embeddings)
+    def message(self, x_j, edge_attr):
+        return x_j + edge_attr
+    def update(self, aggr_out):
+        return self.mlp(aggr_out)
+class GCNConv(MessagePassing):
+    """
+    GCN-style layer whose messages are the neighbour feature plus an edge
+    embedding, scaled by a symmetric degree normalisation.
+    Args:
+        emb_dim (int): dimensionality of embeddings for nodes and edges.
+        aggr (str): aggregation scheme handed to ``MessagePassing``.
+    """
+    def __init__(self, emb_dim, aggr = "add"):
+        # The scheme must reach MessagePassing.__init__: recent torch_geometric
+        # releases resolve the aggregation there, so assigning ``self.aggr``
+        # afterwards does not change how messages are reduced.
+        super(GCNConv, self).__init__(aggr=aggr)
+        self.emb_dim = emb_dim
+        self.linear = torch.nn.Linear(emb_dim, emb_dim)
+        self.edge_embedding1 = torch.nn.Embedding(num_bond_type, emb_dim)
+        self.edge_embedding2 = torch.nn.Embedding(num_bond_direction, emb_dim)
+        torch.nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
+        torch.nn.init.xavier_uniform_(self.edge_embedding2.weight.data)
+    def norm(self, edge_index, num_nodes, dtype):
+        """Return ``deg^-1/2[row] * deg^-1/2[col]`` per edge; ``edge_index`` is the tuple from ``add_self_loops``."""
+        ### assuming that self-loops have been already added in edge_index
+        edge_index = edge_index[0]
+        edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype,
+                                     device=edge_index.device)
+        row, col = edge_index
+        deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
+        deg_inv_sqrt = deg.pow(-0.5)
+        # Nodes with degree 0 give inf; zero them out.
+        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
+        return deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]
+    def forward(self, x, edge_index, edge_attr):
+        """Run one layer on node features ``x`` with two-column integer ``edge_attr``."""
+        #add self loops in the edge space (returns a tuple; element 0 is the edge index)
+        edge_index = add_self_loops(edge_index, num_nodes = x.size(0))
+        #add features corresponding to self-loop edges, built directly on the
+        #device and dtype of edge_attr
+        self_loop_attr = torch.zeros(x.size(0), 2, device=edge_attr.device, dtype=edge_attr.dtype)
+        self_loop_attr[:,0] = 4 #bond type for self-loop edge
+        edge_attr = torch.cat((edge_attr, self_loop_attr), dim = 0)
+        edge_embeddings = self.edge_embedding1(edge_attr[:,0]) + self.edge_embedding2(edge_attr[:,1])
+        norm = self.norm(edge_index, x.size(0), x.dtype)
+        x = self.linear(x)
+        return self.propagate(edge_index[0], x=x, edge_attr=edge_embeddings, norm=norm)
+    def message(self, x_j, edge_attr, norm):
+        return norm.view(-1, 1) * (x_j + edge_attr)
+class GATConv(MessagePassing):
+    """
+    Multi-head attention layer whose messages are the neighbour feature plus
+    an edge embedding; the heads are averaged in ``update``.
+    Args:
+        emb_dim (int): dimensionality of embeddings for nodes and edges.
+        heads (int): number of attention heads.
+        negative_slope (float): slope of the leaky ReLU applied to attention scores.
+        aggr (str): aggregation scheme handed to ``MessagePassing``.
+    """
+    def __init__(self, emb_dim, heads=2, negative_slope=0.2, aggr = "add"):
+        # The scheme must reach MessagePassing.__init__: recent torch_geometric
+        # releases resolve the aggregation there, so assigning ``self.aggr``
+        # afterwards does not change how messages are reduced.
+        super(GATConv, self).__init__(aggr=aggr)
+        self.emb_dim = emb_dim
+        self.heads = heads
+        self.negative_slope = negative_slope
+        self.weight_linear = torch.nn.Linear(emb_dim, heads * emb_dim)
+        self.att = torch.nn.Parameter(torch.Tensor(1, heads, 2 * emb_dim))
+        self.bias = torch.nn.Parameter(torch.Tensor(emb_dim))
+        self.edge_embedding1 = torch.nn.Embedding(num_bond_type, heads * emb_dim)
+        self.edge_embedding2 = torch.nn.Embedding(num_bond_direction, heads * emb_dim)
+        torch.nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
+        torch.nn.init.xavier_uniform_(self.edge_embedding2.weight.data)
+        self.reset_parameters()
+    def norm(self, edge_index, num_nodes, dtype):
+        """Return ``deg^-1/2[row] * deg^-1/2[col]`` per edge; ``edge_index`` is the tuple from ``add_self_loops``."""
+        ### assuming that self-loops have been already added in edge_index
+        edge_index = edge_index[0]
+        edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype,
+                                     device=edge_index.device)
+        row, col = edge_index
+        deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
+        deg_inv_sqrt = deg.pow(-0.5)
+        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
+        return deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]
+    def reset_parameters(self):
+        glorot(self.att)
+        zeros(self.bias)
+    def forward(self, x, edge_index, edge_attr):
+        """Run one layer on node features ``x`` with two-column integer ``edge_attr``."""
+        #add self loops in the edge space (returns a tuple; element 0 is the edge index)
+        edge_index = add_self_loops(edge_index, num_nodes = x.size(0))
+        # ``message`` takes no ``norm`` argument, so the degree normalisation
+        # that used to be computed here was never applied and is not computed.
+        #add features corresponding to self-loop edges, built directly on the
+        #device and dtype of edge_attr
+        self_loop_attr = torch.zeros(x.size(0), 2, device=edge_attr.device, dtype=edge_attr.dtype)
+        self_loop_attr[:,0] = 4 #bond type for self-loop edge
+        edge_attr = torch.cat((edge_attr, self_loop_attr), dim = 0)
+        edge_embeddings = self.edge_embedding1(edge_attr[:,0]) + self.edge_embedding2(edge_attr[:,1])
+        x = self.weight_linear(x).view(-1, self.heads, self.emb_dim)
+        return self.propagate(edge_index[0], x=x, edge_attr=edge_embeddings)
+    def message(self, edge_index, x_i, x_j, edge_attr):
+        edge_attr = edge_attr.view(-1, self.heads, self.emb_dim)
+        # Out-of-place add: do not mutate the gathered neighbour features.
+        x_j = x_j + edge_attr
+        alpha = (torch.cat([x_i, x_j], dim=-1) * self.att).sum(dim=-1)
+        alpha = F.leaky_relu(alpha, self.negative_slope)
+        alpha = softmax(alpha, edge_index[0])
+        return x_j * alpha.view(-1, self.heads, 1)
+    def update(self, aggr_out):
+        # Average over the heads, then add the shared bias.
+        aggr_out = aggr_out.mean(dim=1)
+        aggr_out = aggr_out + self.bias
+        return aggr_out
+class GraphSAGEConv(MessagePassing):
+    """
+    GraphSAGE-style layer: messages are the neighbour feature plus an edge
+    embedding, and the aggregated result is L2-normalised.
+    Args:
+        emb_dim (int): dimensionality of embeddings for nodes and edges.
+        aggr (str): aggregation scheme handed to ``MessagePassing`` (mean by default).
+    """
+    def __init__(self, emb_dim, aggr = "mean"):
+        # The scheme must reach MessagePassing.__init__: recent torch_geometric
+        # releases resolve the aggregation there, so assigning ``self.aggr``
+        # afterwards leaves the base-class default in place and the "mean"
+        # requested here would be ignored.
+        super(GraphSAGEConv, self).__init__(aggr=aggr)
+        self.emb_dim = emb_dim
+        self.linear = torch.nn.Linear(emb_dim, emb_dim)
+        self.edge_embedding1 = torch.nn.Embedding(num_bond_type, emb_dim)
+        self.edge_embedding2 = torch.nn.Embedding(num_bond_direction, emb_dim)
+        torch.nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
+        torch.nn.init.xavier_uniform_(self.edge_embedding2.weight.data)
+    def norm(self, edge_index, num_nodes, dtype):
+        """Return ``deg^-1/2[row] * deg^-1/2[col]`` per edge; ``edge_index`` is the tuple from ``add_self_loops``."""
+        ### assuming that self-loops have been already added in edge_index
+        edge_index = edge_index[0]
+        edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype,
+                                     device=edge_index.device)
+        row, col = edge_index
+        deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
+        deg_inv_sqrt = deg.pow(-0.5)
+        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
+        return deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]
+    def forward(self, x, edge_index, edge_attr):
+        """Run one layer on node features ``x`` with two-column integer ``edge_attr``."""
+        #add self loops in the edge space (returns a tuple; element 0 is the edge index)
+        edge_index = add_self_loops(edge_index, num_nodes = x.size(0))
+        #add features corresponding to self-loop edges, built directly on the
+        #device and dtype of edge_attr
+        self_loop_attr = torch.zeros(x.size(0), 2, device=edge_attr.device, dtype=edge_attr.dtype)
+        self_loop_attr[:,0] = 4 #bond type for self-loop edge
+        edge_attr = torch.cat((edge_attr, self_loop_attr), dim = 0)
+        edge_embeddings = self.edge_embedding1(edge_attr[:,0]) + self.edge_embedding2(edge_attr[:,1])
+        # ``message`` takes no ``norm`` argument, so the degree normalisation
+        # that used to be computed here was never applied and is not computed.
+        x = self.linear(x)
+        return self.propagate(edge_index[0], x=x, edge_attr=edge_embeddings)
+    def message(self, x_j, edge_attr):
+        return x_j + edge_attr
+    def update(self, aggr_out):
+        return F.normalize(aggr_out, p = 2, dim = -1)
+class GNN(torch.nn.Module):
+    """
+    Args:
+        num_layer (int): the number of GNN layers
+        emb_dim (int): dimensionality of embeddings
+        JK (str): last, concat, max or sum.
+        max_pool_layer (int): the layer from which we use max pool rather than add pool for neighbor aggregation
+        drop_ratio (float): dropout rate
+        gnn_type: gin, gcn, graphsage, gat
+    Output:
+        node representations
+    """
+    def __init__(self, num_layer, emb_dim, JK = "last", drop_ratio = 0, gnn_type = "gin"):
+        super(GNN, self).__init__()
+        self.num_layer = num_layer
+        self.drop_ratio = drop_ratio
+        self.JK = JK
+        if self.num_layer < 2:
+            raise ValueError("Number of GNN layers must be greater than 1.")
+        self.x_embedding1 = torch.nn.Embedding(num_atom_type, emb_dim)
+        self.x_embedding2 = torch.nn.Embedding(num_chirality_tag, emb_dim)
+        torch.nn.init.xavier_uniform_(self.x_embedding1.weight.data)
+        torch.nn.init.xavier_uniform_(self.x_embedding2.weight.data)
+        ###List of MLPs
+        self.gnns = torch.nn.ModuleList()
+        for layer in range(num_layer):
+            if gnn_type == "gin":
+                self.gnns.append(GINConv(emb_dim, aggr = "add"))
+            elif gnn_type == "gcn":
+                self.gnns.append(GCNConv(emb_dim))
+            elif gnn_type == "gat":
+                self.gnns.append(GATConv(emb_dim))
+            elif gnn_type == "graphsage":
+                self.gnns.append(GraphSAGEConv(emb_dim))
+        ###List of batchnorms
+        self.batch_norms = torch.nn.ModuleList()
+        for layer in range(num_layer):
+            self.batch_norms.append(torch.nn.BatchNorm1d(emb_dim))
+    #def forward(self, x, edge_index, edge_attr):
+    def forward(self, *argv):
+        if len(argv) == 3:
+            x, edge_index, edge_attr = argv[0], argv[1], argv[2]
+        elif len(argv) == 1:
+            data = argv[0]
+            x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
+        else:
+            raise ValueError("unmatched number of arguments.")
+        x = self.x_embedding1(x[:,0]) + self.x_embedding2(x[:,1])
+        h_list = [x]
+        for layer in range(self.num_layer):
+            h = self.gnns[layer](h_list[layer], edge_index, edge_attr)
+            h = self.batch_norms[layer](h)
+            #h = F.dropout(F.relu(h), self.drop_ratio, training = self.training)
+            if layer == self.num_layer - 1:
+                #remove relu for the last layer
+                h = F.dropout(h, self.drop_ratio, training = self.training)
+            else:
+                h = F.dropout(F.relu(h), self.drop_ratio, training = self.training)
+            h_list.append(h)
+        ### Different implementations of Jk-concat
+        if self.JK == "concat":
+            node_representation = torch.cat(h_list, dim = 1)
+        elif self.JK == "last":
+            node_representation = h_list[-1]
+        elif self.JK == "max":
+            h_list = [h.unsqueeze_(0) for h in h_list]
+            node_representation = torch.max(torch.cat(h_list, dim = 0), dim = 0)[0]
+        elif self.JK == "sum":
+            h_list = [h.unsqueeze_(0) for h in h_list]
+            node_representation = torch.sum(torch.cat(h_list, dim = 0), dim = 0)[0]
+        return node_representation
+class GNN_graphpred(torch.nn.Module):
+    """
+    Extension of GIN to incorporate edge information by concatenation.
+    Args:
+        num_layer (int): the number of GNN layers
+        emb_dim (int): dimensionality of embeddings
+        num_tasks (int): number of tasks in multi-task learning scenario
+        drop_ratio (float): dropout rate
+        JK (str): last, concat, max or sum.
+        graph_pooling (str): sum, mean, max, attention, set2set
+        gnn_type: gin, gcn, graphsage, gat
+    See https://arxiv.org/abs/1810.00826
+    JK-net: https://arxiv.org/abs/1806.03536
+    """
+    def __init__(self, num_layer, emb_dim, num_tasks, JK = "last", drop_ratio = 0, graph_pooling = "mean", gnn_type = "gin"):
+        super(GNN_graphpred, self).__init__()
+        self.num_layer = num_layer
+        self.drop_ratio = drop_ratio
+        self.JK = JK
+        self.emb_dim = emb_dim
+        self.num_tasks = num_tasks
+        if self.num_layer < 2:
+            raise ValueError("Number of GNN layers must be greater than 1.")
+        self.gnn = GNN(num_layer, emb_dim, JK, drop_ratio, gnn_type = gnn_type)
+        #Different kind of graph pooling
+        if graph_pooling == "sum":
+            self.pool = global_add_pool
+        elif graph_pooling == "mean":
+            self.pool = global_mean_pool
+        elif graph_pooling == "max":
+            self.pool = global_max_pool
+        elif graph_pooling == "attention":
+            if self.JK == "concat":
+                self.pool = GlobalAttention(gate_nn = torch.nn.Linear((self.num_layer + 1) * emb_dim, 1))
+            else:
+                self.pool = GlobalAttention(gate_nn = torch.nn.Linear(emb_dim, 1))
+        elif graph_pooling[:-1] == "set2set":
+            set2set_iter = int(graph_pooling[-1])
+            if self.JK == "concat":
+                self.pool = Set2Set((self.num_layer + 1) * emb_dim, set2set_iter)
+            else:
+                self.pool = Set2Set(emb_dim, set2set_iter)
+        else:
+            raise ValueError("Invalid graph pooling type.")
+        #For graph-level binary classification
+        if graph_pooling[:-1] == "set2set":
+            self.mult = 2
+        else:
+            self.mult = 1
+        if self.JK == "concat":
+            self.graph_pred_linear = torch.nn.Linear(self.mult * (self.num_layer + 1) * self.emb_dim, self.num_tasks)
+        else:
+            self.graph_pred_linear = torch.nn.Linear(self.mult * self.emb_dim, self.num_tasks)
+    def from_pretrained(self, model_file):
+        #self.gnn = GNN(self.num_layer, self.emb_dim, JK = self.JK, drop_ratio = self.drop_ratio)
+        self.gnn.load_state_dict(torch.load(model_file))
+    def forward(self, *argv):
+        if len(argv) == 4:
+            x, edge_index, edge_attr, batch = argv[0], argv[1], argv[2], argv[3]
+        elif len(argv) == 1:
+            data = argv[0]
+            x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
+        else:
+            raise ValueError("unmatched number of arguments.")
+        node_representation = self.gnn(x, edge_index, edge_attr)
+        return self.graph_pred_linear(self.pool(node_representation, batch))
+class MLP(nn.Module):
+    """
+    Creates a NN using nn.ModuleList to automatically adjust the number of layers.
+    For each hidden layer, the number of inputs and outputs is constant.
+    Inputs:
+        in_dim (int):               number of features contained in the input layer.
+        out_dim (int):              number of features produced by the final linear layer.
+                                    The hidden width is out_dim when out_dim >= 10,
+                                    otherwise in_dim.
+        num_layers (int):           number of hidden (Linear + optional norm + activation)
+                                    stages placed before the final linear layer
+        activation (torch function): activation module appended after every hidden stage
+                                    (the same module object is reused for each stage)
+        layer_norm (bool):          append nn.LayerNorm after each hidden Linear
+        batch_norm (bool):          append nn.BatchNorm1d after each hidden Linear
+    """
+    def __init__(self, in_dim, out_dim, num_layers, activation=torch.nn.ReLU(), layer_norm=False, batch_norm=False):
+        super(MLP, self).__init__()
+        self.layers = nn.ModuleList()
+        # small outputs (e.g. a scalar head) keep the input width for the hidden layers
+        h_dim = in_dim if out_dim < 10 else out_dim
+        # create the input layer
+        for layer in range(num_layers):
+            if layer == 0:
+                self.layers.append(nn.Linear(in_dim, h_dim))
+            else:
+                self.layers.append(nn.Linear(h_dim, h_dim))
+            if layer_norm: self.layers.append(nn.LayerNorm(h_dim))
+            if batch_norm: self.layers.append(nn.BatchNorm1d(h_dim))
+            self.layers.append(activation)
+        # output projection; NOTE: with num_layers == 0 this expects h_dim inputs, not in_dim
+        self.layers.append(nn.Linear(h_dim, out_dim))
+    def forward(self, x):
+        """Apply every registered layer to ``x`` in order."""
+        for i in range(len(self.layers)):
+            x = self.layers[i](x)
+        return x
+# Running this module directly does nothing; the guard is only a placeholder.
+if __name__ == "__main__":
+    pass

models/encoders/schnet.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import torch
+import torch.nn.functional as F
+from torch.nn import Module, Sequential, ModuleList, Linear
+from torch_geometric.nn import MessagePassing, radius_graph
+from math import pi as PI
+from ..common import GaussianSmearing, ShiftedSoftplus
+class CFConv(MessagePassing):
+    def __init__(self, in_channels, out_channels, num_filters, edge_channels, cutoff=10.0):
+        super().__init__(aggr='add')
+        self.lin1 = Linear(in_channels, num_filters, bias=False)
+        self.lin2 = Linear(num_filters, out_channels)
+        self.nn = Sequential(
+            Linear(edge_channels, num_filters),
+            ShiftedSoftplus(),
+            Linear(num_filters, num_filters),
+        )   # Network for generating filter weights
+        self.cutoff = cutoff
+        self.reset_parameters()
+    def reset_parameters(self):
+        torch.nn.init.xavier_uniform_(self.nn[0].weight)
+        self.nn[0].bias.data.fill_(0)
+        torch.nn.init.xavier_uniform_(self.nn[2].weight)
+        self.nn[0].bias.data.fill_(0)
+        torch.nn.init.xavier_uniform_(self.lin1.weight)
+        torch.nn.init.xavier_uniform_(self.lin2.weight)
+        self.lin2.bias.data.fill_(0)
+    def forward(self, x, edge_index, edge_length, edge_attr):
+        W = self.nn(edge_attr)
+        if self.cutoff is not None:
+            C = 0.5 * (torch.cos(edge_length * PI / self.cutoff) + 1.0)
+            C = C * (edge_length <= self.cutoff) * (edge_length >= 0.0)     # Modification: cutoff
+            W = W * C.view(-1, 1)
+        x = self.lin1(x)
+        x = self.propagate(edge_index, x=x, W=W)
+        x = self.lin2(x)
+        return x
+    def message(self, x_j, W):
+        return x_j * W
+class InteractionBlock(Module):
+    """
+    One interaction step: CFConv, ShiftedSoftplus activation, then a linear layer.
+    Input and output node features both have ``hidden_channels`` entries.
+    """
+    def __init__(self, hidden_channels, num_gaussians, num_filters, cutoff):
+        super(InteractionBlock, self).__init__()
+        # num_gaussians is passed as CFConv's edge_channels (width of the edge attributes)
+        self.conv = CFConv(hidden_channels, hidden_channels, num_filters, num_gaussians, cutoff)
+        self.act = ShiftedSoftplus()
+        self.lin = Linear(hidden_channels, hidden_channels)
+        self.reset_parameters()
+    def reset_parameters(self):
+        """Re-initialise the convolution and the output linear layer."""
+        self.conv.reset_parameters()
+        torch.nn.init.xavier_uniform_(self.lin.weight)
+        self.lin.bias.data.fill_(0)
+    def forward(self, x, edge_index, edge_length, edge_attr):
+        """Return the updated node features; arguments are forwarded unchanged to CFConv."""
+        x = self.conv(x, edge_index, edge_length, edge_attr)
+        x = self.act(x)
+        x = self.lin(x)
+        return x
+class SchNetEncoder(Module):
+    """
+    Encoder built from ``num_interactions`` residual InteractionBlocks operating on a
+    radius graph constructed from node positions.
+
+    Args:
+        hidden_channels (int): width of the node features (input and output)
+        num_filters (int): filter width inside each CFConv
+        num_interactions (int): number of stacked interaction blocks
+        edge_channels (int): number of Gaussians used to expand edge lengths
+        cutoff (float): radius used to build the graph and upper end of the distance expansion
+    """
+    def __init__(self, hidden_channels=128, num_filters=128,
+                num_interactions=6, edge_channels=64, cutoff=10.0):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.num_filters = num_filters
+        self.num_interactions = num_interactions
+        self.distance_expansion = GaussianSmearing(stop=cutoff, num_gaussians=edge_channels)
+        self.cutoff = cutoff
+        self.interactions = ModuleList()
+        for _ in range(num_interactions):
+            block = InteractionBlock(hidden_channels, edge_channels,
+                                     num_filters, cutoff)
+            self.interactions.append(block)
+        self.reset_parameters()
+    def reset_parameters(self):
+        """Re-initialise every interaction block."""
+        for interaction in self.interactions:
+            interaction.reset_parameters()
+    @property
+    def out_channels(self):
+        """Width of the node features returned by ``forward``."""
+        return self.hidden_channels
+    def forward(self, node_attr, pos, batch):
+        """
+        Args:
+            node_attr: input node features
+            pos: node positions used to build the radius graph
+            batch: batch assignment passed to ``radius_graph``
+        Returns:
+            node features after all residual interaction blocks
+        """
+        # edges connect nodes within self.cutoff of each other; no self loops
+        edge_index = radius_graph(pos, self.cutoff, batch=batch, loop=False)
+        edge_length = torch.norm(pos[edge_index[0]] - pos[edge_index[1]], dim=1)
+        edge_attr = self.distance_expansion(edge_length)
+        h = node_attr
+        for interaction in self.interactions:
+            # residual update
+            h = h + interaction(h, edge_index, edge_length, edge_attr)
+        return h

models/encoders/tf.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import Module, Sequential, ModuleList, Linear, Conv1d, LeakyReLU
+from torch_geometric.nn import radius_graph, knn_graph
+from torch_scatter import scatter_sum, scatter_softmax
+import math
+from math import pi as PI
+from ..common import GaussianSmearing, ShiftedSoftplus
+class AttentionInteractionBlock(Module):
+    """
+    Multi-head attention message-passing block whose keys and values are modulated
+    by weights generated from the edge attributes.
+
+    Args:
+        hidden_channels (int): width of the node features; must be divisible by num_heads
+        edge_channels (int): width of the edge attributes
+        key_channels (int): total key/query width; must be divisible by num_heads
+        num_heads (int): number of attention heads
+    """
+    def __init__(self, hidden_channels, edge_channels, key_channels, num_heads=1):
+        super().__init__()
+        assert hidden_channels % num_heads == 0
+        assert key_channels % num_heads == 0
+        self.hidden_channels = hidden_channels
+        self.key_channels = key_channels
+        self.num_heads = num_heads
+        # kernel-size-1 grouped convolutions act as per-head linear projections
+        self.k_lin = Conv1d(hidden_channels, key_channels, 1, groups=num_heads, bias=False)
+        self.q_lin = Conv1d(hidden_channels, key_channels, 1, groups=num_heads, bias=False)
+        self.v_lin = Conv1d(hidden_channels, hidden_channels, 1, groups=num_heads, bias=False)
+        # edge attributes -> per-head multiplicative weights for the keys
+        self.weight_k_net = Sequential(
+            Linear(edge_channels, key_channels // num_heads),
+            LeakyReLU(),
+            Linear(key_channels // num_heads, key_channels // num_heads),
+        )
+        self.weight_k_lin = Linear(key_channels // num_heads, key_channels // num_heads)
+        # edge attributes -> per-head multiplicative weights for the values
+        self.weight_v_net = Sequential(
+            Linear(edge_channels, hidden_channels // num_heads),
+            LeakyReLU(),
+            Linear(hidden_channels // num_heads, hidden_channels // num_heads),
+        )
+        self.weight_v_lin = Linear(hidden_channels // num_heads, hidden_channels // num_heads)
+        self.centroid_lin = Linear(hidden_channels, hidden_channels)
+        self.act = LeakyReLU()
+        self.out_transform = Linear(hidden_channels, hidden_channels)
+        self.layernorm_ffn = nn.LayerNorm(hidden_channels)
+    def forward(self, x, edge_index, edge_attr):
+        """
+        Args:
+            x:  Node features, (N, H).
+            edge_index: (2, E).
+            edge_attr:  (E, edge_channels), the input width of the weight networks.
+        Returns:
+            Updated node features, (N, H).
+        """
+        N = x.size(0)
+        row, col = edge_index  # (E,) , (E,)
+        # Project to multiple key, query and value spaces
+        h_keys = self.k_lin(x.unsqueeze(-1)).view(N, self.num_heads, -1)  # (N, heads, K_per_head)
+        h_queries = self.q_lin(x.unsqueeze(-1)).view(N, self.num_heads, -1)  # (N, heads, K_per_head)
+        h_values = self.v_lin(x.unsqueeze(-1)).view(N, self.num_heads, -1)  # (N, heads, H_per_head)
+        # Compute keys and queries
+        W_k = self.weight_k_net(edge_attr)  # (E, K_per_head)
+        keys_j = self.weight_k_lin(W_k.unsqueeze(1) * h_keys[col])  # (E, heads, K_per_head)
+        queries_i = h_queries[row]  # (E, heads, K_per_head)
+        # Compute attention weights (alphas)
+        # NOTE: the scale uses hidden_channels / num_heads, not key_channels / num_heads
+        d = int(self.hidden_channels / self.num_heads)
+        qk_ij = (queries_i * keys_j).sum(-1) / math.sqrt(d)  # (E, heads)
+        # softmax over all edges that share the same row index
+        alpha = scatter_softmax(qk_ij, row, dim=0)
+        # Compose messages
+        W_v = self.weight_v_net(edge_attr)  # (E, H_per_head)
+        msg_j = self.weight_v_lin(W_v.unsqueeze(1) * h_values[col])  # (E, heads, H_per_head)
+        msg_j = alpha.unsqueeze(-1) * msg_j  # (E, heads, H_per_head)
+        # Aggregate messages
+        aggr_msg = scatter_sum(msg_j, row, dim=0, dim_size=N).view(N, -1)  # (N, heads*H_per_head)
+        out = self.centroid_lin(x) + aggr_msg
+        out = self.layernorm_ffn(out)
+        out = self.out_transform(self.act(out))
+        return out
+class TransformerEncoder(Module):
+    """
+    Encoder built from ``num_interactions`` residual AttentionInteractionBlocks
+    operating on a k-nearest-neighbour graph constructed from node positions.
+
+    Args:
+        hidden_channels (int): width of the node features (input and output)
+        edge_channels (int): number of Gaussians used to expand edge lengths
+        key_channels (int): total key/query width of each attention block
+        num_heads (int): number of attention heads
+        num_interactions (int): number of stacked attention blocks
+        k (int): number of neighbours per node in the kNN graph
+        cutoff (float): upper end of the Gaussian distance expansion
+    """
+    def __init__(self, hidden_channels=256, edge_channels=64, key_channels=128, num_heads=4, num_interactions=6, k=32,
+                 cutoff=10.0):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.edge_channels = edge_channels
+        self.key_channels = key_channels
+        self.num_heads = num_heads
+        self.num_interactions = num_interactions
+        self.k = k
+        self.cutoff = cutoff
+        self.distance_expansion = GaussianSmearing(stop=cutoff, num_gaussians=edge_channels)
+        self.interactions = ModuleList()
+        for _ in range(num_interactions):
+            block = AttentionInteractionBlock(
+                hidden_channels=hidden_channels,
+                edge_channels=edge_channels,
+                key_channels=key_channels,
+                num_heads=num_heads,
+            )
+            self.interactions.append(block)
+    @property
+    def out_channels(self):
+        """Width of the node features returned by ``forward``."""
+        return self.hidden_channels
+    def forward(self, node_attr, pos, batch):
+        """
+        Args:
+            node_attr: input node features
+            pos: node positions used to build the kNN graph
+            batch: batch assignment passed to ``knn_graph``
+        Returns:
+            node features after all residual attention blocks
+        """
+        # The graph is built by kNN; self.cutoff only bounds the distance expansion.
+        # edge_index = radius_graph(pos, self.cutoff, batch=batch, loop=False)
+        edge_index = knn_graph(pos, k=self.k, batch=batch, flow='target_to_source')
+        edge_length = torch.norm(pos[edge_index[0]] - pos[edge_index[1]], dim=1)
+        edge_attr = self.distance_expansion(edge_length)
+        h = node_attr
+        for interaction in self.interactions:
+            # residual update
+            h = h + interaction(h, edge_index, edge_attr)
+        return h
+if __name__ == '__main__':
+    from torch_geometric.data import Data, Batch
+    hidden_channels = 64
+    edge_channels = 48
+    key_channels = 32
+    num_heads = 4
+    data_list = []
+    for num_nodes in [11, 13, 15]:
+        data_list.append(Data(
+            x=torch.randn([num_nodes, hidden_channels]),
+            pos=torch.randn([num_nodes, 3]) * 2
+        ))
+    batch = Batch.from_data_list(data_list)
+    model = CFTransformerEncoder(
+        hidden_channels=hidden_channels,
+        edge_channels=edge_channels,
+        key_channels=key_channels,
+        num_heads=num_heads,
+    )
+    out = model(batch.x, batch.pos, batch.batch)
+    print(out)
+    print(out.size())

models/flag.py ADDED Viewed

	@@ -0,0 +1,268 @@

+import sys
+sys.path.append("..")
+import torch
+import torch.nn as nn
+from torch.nn import Module, Linear, Embedding
+from torch.nn import functional as F
+from torch_scatter import scatter_add, scatter_mean
+from torch_geometric.data import Data, Batch
+from copy import deepcopy
+from .encoders import get_encoder, GNN_graphpred, MLP
+from .common import *
+from utils import dihedral_utils, chemutils
+class FLAG(Module):
+    def __init__(self, config, protein_atom_feature_dim, ligand_atom_feature_dim, vocab):
+        """
+        Build all sub-networks and loss modules.
+
+        Args:
+            config: model configuration; read here are ``hidden_channels``, ``encoder``,
+                ``random_alpha``, ``refinement`` and ``encoder.edge_channels``
+            protein_atom_feature_dim (int): width of the raw protein atom features
+            ligand_atom_feature_dim (int): width of the raw ligand atom features
+            vocab: motif vocabulary; only ``vocab.size()`` is used here
+        """
+        super().__init__()
+        self.config = config
+        self.vocab = vocab
+        # project raw atom features of both molecule types to the shared hidden width
+        self.protein_atom_emb = Linear(protein_atom_feature_dim, config.hidden_channels)
+        self.ligand_atom_emb = Linear(ligand_atom_feature_dim, config.hidden_channels)
+        # motif embedding table has one row more than the vocabulary
+        # (presumably for a padding / "no motif" id - confirm against the data pipeline)
+        self.embedding = nn.Embedding(vocab.size() + 1, config.hidden_channels)
+        # next-motif head: [pooled atom hidden ; current motif embedding] -> hidden -> vocabulary scores
+        self.W = nn.Linear(2 * config.hidden_channels, config.hidden_channels)
+        self.W_o = nn.Linear(config.hidden_channels, self.vocab.size())
+        self.encoder = get_encoder(config.encoder)
+        # scores candidate attachments (one output per candidate graph)
+        self.comb_head = GNN_graphpred(num_layer=3, emb_dim=config.hidden_channels, num_tasks=1, JK='last',
+                                       drop_ratio=0.5, graph_pooling='mean', gnn_type='gin')
+        # torsion-angle head: input is [hx, hy, h_mol] plus a random vector of hidden width when random_alpha is set
+        if config.random_alpha:
+            self.alpha_mlp = MLP(in_dim=config.hidden_channels * 4, out_dim=1, num_layers=2)
+        else:
+            self.alpha_mlp = MLP(in_dim=config.hidden_channels * 3, out_dim=1, num_layers=2)
+        self.focal_mlp_ligand = MLP(in_dim=config.hidden_channels, out_dim=1, num_layers=1)
+        self.focal_mlp_protein = MLP(in_dim=config.hidden_channels, out_dim=1, num_layers=1)
+        # distance head works on the raw (un-embedded) protein and ligand features
+        self.dist_mlp = MLP(in_dim=protein_atom_feature_dim + ligand_atom_feature_dim, out_dim=1, num_layers=2)
+        # refinement heads exist only when config.refinement is set
+        if config.refinement:
+            self.refine_protein = MLP(in_dim=config.hidden_channels * 2 + config.encoder.edge_channels, out_dim=1, num_layers=2)
+            self.refine_ligand = MLP(in_dim=config.hidden_channels * 2 + config.encoder.edge_channels, out_dim=1, num_layers=2)
+        self.smooth_cross_entropy = SmoothCrossEntropyLoss(reduction='mean', smoothing=0.1)
+        self.pred_loss = nn.CrossEntropyLoss()
+        self.comb_loss = nn.BCEWithLogitsLoss()
+        self.three_hop_loss = torch.nn.MSELoss()
+        self.focal_loss = nn.BCEWithLogitsLoss()
+        self.dist_loss = torch.nn.MSELoss(reduction='mean')
+    def forward(self, protein_pos, protein_atom_feature, ligand_pos, ligand_atom_feature, batch_protein, batch_ligand):
+        """
+        Encode the protein-ligand context and predict a focal score per atom.
+
+        Returns:
+            focal_pred: focal scores, protein atoms first, then ligand atoms
+            protein_mask: boolean mask over the context that is True for protein atoms
+            h_ctx: encoded context features
+        """
+        h_protein = self.protein_atom_emb(protein_atom_feature)
+        h_ligand = self.ligand_atom_emb(ligand_atom_feature)
+        # merges protein and ligand atoms into one context; helper comes from .common
+        h_ctx, pos_ctx, batch_ctx, protein_mask = compose_context_stable(h_protein=h_protein, h_ligand=h_ligand,
+                                                                         pos_protein=protein_pos, pos_ligand=ligand_pos,
+                                                                         batch_protein=batch_protein,
+                                                                         batch_ligand=batch_ligand)
+        h_ctx = self.encoder(node_attr=h_ctx, pos=pos_ctx, batch=batch_ctx)  # (N_p+N_l, H)
+        # NOTE(review): rows are ordered protein-then-ligand here, which matches h_ctx's own
+        # row order only if the composed context is not interleaved - confirm
+        focal_pred = torch.cat([self.focal_mlp_protein(h_ctx[protein_mask]), self.focal_mlp_ligand(h_ctx[~protein_mask])], dim=0)
+        return focal_pred, protein_mask, h_ctx
+    def forward_motif(self, h_ctx_focal, current_wid, current_atoms_batch, n_samples=1):
+        node_hiddens = scatter_add(h_ctx_focal, dim=0, index=current_atoms_batch)
+        motif_hiddens = self.embedding(current_wid)
+        pred_vecs = torch.cat([node_hiddens, motif_hiddens], dim=1)
+        pred_vecs = nn.ReLU()(self.W(pred_vecs))
+        pred_scores = self.W_o(pred_vecs)
+        pred_scores = F.softmax(pred_scores, dim=-1)
+        _, preds = torch.max(pred_scores, dim=1)
+        # random select n_samples in topk
+        k = 5*n_samples
+        select_pool = torch.topk(pred_scores, k, dim=1)[1]
+        index = torch.randint(k, (select_pool.shape[0], n_samples))
+        preds = torch.cat([select_pool[i][index[i]] for i in range(len(index))])
+        idx_parent = torch.repeat_interleave(torch.arange(pred_scores.shape[0]), n_samples, dim=0).to(pred_scores.device)
+        prob = pred_scores[idx_parent, preds]
+        return preds, prob
+    def forward_attach(self, mol_list, next_motif_smiles, device):
+        """
+        Attach the next motif to every molecule and keep the best-scored candidate.
+
+        Candidates come from ``chemutils.assemble``; each candidate graph is scored by
+        ``self.comb_head`` and, within every group of ``cand_batch``, the candidate with
+        the highest score is selected.
+
+        Returns:
+            select_mols, new_atoms, one_atom_attach, intersection: the per-candidate lists
+                from ``chemutils.assemble`` restricted to the selected candidates
+            attach_fail: passed through unchanged from ``chemutils.assemble``
+        """
+        cand_mols, cand_batch, new_atoms, one_atom_attach, intersection, attach_fail = chemutils.assemble(mol_list, next_motif_smiles)
+        graph_data = Batch.from_data_list([chemutils.mol_to_graph_data_obj_simple(mol) for mol in cand_mols]).to(device)
+        comb_pred = self.comb_head(graph_data.x, graph_data.edge_index, graph_data.edge_attr, graph_data.batch).reshape(-1)
+        # slice_idx[i]:slice_idx[i + 1] is the candidate range of group i
+        # (assumes cand_batch is sorted and lives on the CPU - confirm in chemutils.assemble)
+        slice_idx = torch.cat([torch.tensor([0]), torch.cumsum(cand_batch.bincount(), dim=0)], dim=0)
+        # greedy choice: global index of the highest-scoring candidate in every group
+        select = [(torch.argmax(comb_pred[slice_idx[i]:slice_idx[i + 1]]) + slice_idx[i]).item() for i in
+                  range(len(slice_idx) - 1)]
+        # disabled alternative kept as a string literal: sample a candidate in proportion to exp(score)
+        '''
+        select = []
+        for k in range(len(slice_idx) - 1):
+            id = torch.multinomial(torch.exp(comb_pred[slice_idx[k]:slice_idx[k + 1]]).reshape(-1).float(), 1)
+            select.append((id+slice_idx[k]).item())'''
+        select_mols = [cand_mols[i] for i in select]
+        new_atoms = [new_atoms[i] for i in select]
+        one_atom_attach = [one_atom_attach[i] for i in select]
+        intersection = [intersection[i] for i in select]
+        return select_mols, new_atoms, one_atom_attach, intersection, attach_fail
+    def forward_alpha(self, protein_pos, protein_atom_feature, ligand_pos, ligand_atom_feature, batch_protein,
+                     batch_ligand, xy_index, rotatable):
+        """
+        Predict one value ``alpha`` per (x, y) ligand atom pair using ``self.alpha_mlp``.
+
+        Args:
+            xy_index: two columns of ligand-atom indices (column 0 = x, column 1 = y)
+            rotatable: index/mask applied to the per-molecule summed ligand features
+            (remaining arguments as in ``forward``)
+        Returns:
+            alpha: output of ``self.alpha_mlp`` for every pair
+        """
+        # encode again
+        h_protein = self.protein_atom_emb(protein_atom_feature)
+        h_ligand = self.ligand_atom_emb(ligand_atom_feature)
+        h_ctx, pos_ctx, batch_ctx, protein_mask = compose_context_stable(h_protein=h_protein, h_ligand=h_ligand,
+                                                                         pos_protein=protein_pos, pos_ligand=ligand_pos,
+                                                                         batch_protein=batch_protein,
+                                                                         batch_ligand=batch_ligand)
+        h_ctx = self.encoder(node_attr=h_ctx, pos=pos_ctx, batch=batch_ctx)  # (N_p+N_l, H)
+        h_ctx_ligand = h_ctx[~protein_mask]
+        hx, hy = h_ctx_ligand[xy_index[:, 0]], h_ctx_ligand[xy_index[:, 1]]
+        # per-molecule sum of ligand features, then keep the entries selected by `rotatable`
+        # (assumes the result lines up row-for-row with xy_index - confirm against caller)
+        h_mol = scatter_add(h_ctx_ligand, dim=0, index=batch_ligand)
+        h_mol = h_mol[rotatable]
+        if self.config.random_alpha:
+            # standard-normal noise with the same shape as hx is appended to the input
+            rand_dist = torch.distributions.normal.Normal(loc=0, scale=1)
+            rand_alpha = rand_dist.sample(hx.shape).to(hx.device)
+            alpha = self.alpha_mlp(torch.cat([hx, hy, h_mol, rand_alpha], dim=-1))
+        else:
+            alpha = self.alpha_mlp(torch.cat([hx, hy, h_mol], dim=-1))
+        return alpha
+    def get_loss(self, protein_pos, protein_atom_feature, ligand_pos, ligand_atom_feature, ligand_pos_torsion,
+                 ligand_atom_feature_torsion, batch_protein, batch_ligand, batch_ligand_torsion, batch):
+        self.device = protein_pos.device
+        h_protein = self.protein_atom_emb(protein_atom_feature)
+        h_ligand = self.ligand_atom_emb(ligand_atom_feature)
+        loss_list = [0, 0, 0, 0, 0, 0]
+        # Encode for motif prediction
+        h_ctx, pos_ctx, batch_ctx, mask_protein = compose_context_stable(h_protein=h_protein, h_ligand=h_ligand,
+                                                                         pos_protein=protein_pos, pos_ligand=ligand_pos,
+                                                                         batch_protein=batch_protein,
+                                                                         batch_ligand=batch_ligand)
+        h_ctx = self.encoder(node_attr=h_ctx, pos=pos_ctx, batch=batch_ctx)  # (N_p+N_l, H)
+        h_ctx_ligand = h_ctx[~mask_protein]
+        h_ctx_protein = h_ctx[mask_protein]
+        h_ctx_focal = h_ctx[batch['current_atoms']]
+        # Encode for torsion prediction
+        if len(batch['y_pos']) > 0:
+            h_ligand_torsion = self.ligand_atom_emb(ligand_atom_feature_torsion)
+            h_ctx_torison, pos_ctx_torison, batch_ctx_torsion, mask_protein = compose_context_stable(h_protein=h_protein,
+                                                                                                     h_ligand=h_ligand_torsion,
+                                                                                                     pos_protein=protein_pos,
+                                                                                                     pos_ligand=ligand_pos_torsion,
+                                                                                                     batch_protein=batch_protein,
+                                                                                                     batch_ligand=batch_ligand_torsion)
+            h_ctx_torsion = self.encoder(node_attr=h_ctx_torison, pos=pos_ctx_torison, batch=batch_ctx_torsion)  # (N_p+N_l, H)
+            h_ctx_ligand_torsion = h_ctx_torsion[~mask_protein]
+        # next motif prediction
+        node_hiddens = scatter_add(h_ctx_focal, dim=0, index=batch['current_atoms_batch'])
+        motif_hiddens = self.embedding(batch['current_wid'])
+        pred_vecs = torch.cat([node_hiddens, motif_hiddens], dim=1)
+        pred_vecs = nn.ReLU()(self.W(pred_vecs))
+        pred_scores = self.W_o(pred_vecs)
+        pred_loss = self.pred_loss(pred_scores, batch['next_wid'])
+        loss_list[0] = pred_loss.item()
+        # attachment prediction
+        if len(batch['cand_labels']) > 0:
+            cand_mols = batch['cand_mols']
+            comb_pred = self.comb_head(cand_mols.x, cand_mols.edge_index, cand_mols.edge_attr, cand_mols.batch)
+            comb_loss = self.comb_loss(comb_pred, batch['cand_labels'].view(comb_pred.shape).float())
+            loss_list[1] = comb_loss.item()
+        else:
+            comb_loss = 0
+        # focal prediction
+        focal_ligand_pred, focal_protein_pred = self.focal_mlp_ligand(h_ctx_ligand), self.focal_mlp_protein(h_ctx_protein)
+        focal_loss = self.focal_loss(focal_ligand_pred.reshape(-1), batch['ligand_frontier'].float()) +\
+                     self.focal_loss(focal_protein_pred.reshape(-1), batch['protein_contact'].float())
+        loss_list[2] = focal_loss.item()
+        # distance matrix prediction
+        if len(batch['true_dm']) > 0:
+            input = torch.cat([protein_atom_feature[batch['dm_protein_idx']], ligand_atom_feature[batch['dm_ligand_idx']]], dim=-1)
+            pred_dist = self.dist_mlp(input)
+            dm_target = batch['true_dm'].unsqueeze(-1)
+            dm_loss = self.dist_loss(pred_dist, dm_target)
+            loss_list[3] = dm_loss.item()
+        else:
+            dm_loss = 0
+        # structure refinement loss
+        if self.config.refinement and len(batch['true_dm']) > 0:
+            true_distance_alpha = torch.norm(batch['ligand_context_pos'][batch['sr_ligand_idx']] - batch['protein_pos'][batch['sr_protein_idx']], dim=1)
+            true_distance_intra = torch.norm(batch['ligand_context_pos'][batch['sr_ligand_idx0']] - batch['ligand_context_pos'][batch['sr_ligand_idx1']], dim=1)
+            input_distance_alpha = ligand_pos[batch['sr_ligand_idx']] - protein_pos[batch['sr_protein_idx']]
+            input_distance_intra = ligand_pos[batch['sr_ligand_idx0']] - ligand_pos[batch['sr_ligand_idx1']]
+            distance_emb1 = self.encoder.distance_expansion(torch.norm(input_distance_alpha, dim=1))
+            distance_emb2 = self.encoder.distance_expansion(torch.norm(input_distance_intra, dim=1))
+            input1 = torch.cat([h_ctx_ligand[batch['sr_ligand_idx']], h_ctx_protein[batch['sr_protein_idx']], distance_emb1], dim=-1)[true_distance_alpha<=10.0]
+            input2 = torch.cat([h_ctx_ligand[batch['sr_ligand_idx0']], h_ctx_ligand[batch['sr_ligand_idx1']], distance_emb2], dim=-1)[true_distance_intra<=10.0]
+            #distance cut_off
+            norm_dir1 = F.normalize(input_distance_alpha, p=2, dim=1)[true_distance_alpha<=10.0]
+            norm_dir2 = F.normalize(input_distance_intra, p=2, dim=1)[true_distance_intra<=10.0]
+            force1 = scatter_mean(self.refine_protein(input1)*norm_dir1, dim=0, index=batch['sr_ligand_idx'][true_distance_alpha<=10.0], dim_size=ligand_pos.size(0))
+            force2 = scatter_mean(self.refine_ligand(input2)*norm_dir2, dim=0, index=batch['sr_ligand_idx0'][true_distance_intra<=10.0], dim_size=ligand_pos.size(0))
+            new_ligand_pos = deepcopy(ligand_pos)
+            new_ligand_pos += force1
+            new_ligand_pos += force2
+            refine_dist1 = torch.norm(new_ligand_pos[batch['sr_ligand_idx']] - protein_pos[batch['sr_protein_idx']], dim=1)
+            refine_dist2 = torch.norm(new_ligand_pos[batch['sr_ligand_idx0']] - new_ligand_pos[batch['sr_ligand_idx1']], dim=1)
+            sr_loss = (self.dist_loss(refine_dist1, true_distance_alpha) + self.dist_loss(refine_dist2, true_distance_intra))
+            loss_list[5] = sr_loss.item()
+        else:
+            sr_loss = 0
+        # torsion prediction
+        if len(batch['y_pos']) > 0:
+            Hx = dihedral_utils.rotation_matrix_v2(batch['y_pos'])
+            xn_pos = torch.matmul(Hx, batch['xn_pos'].permute(0, 2, 1)).permute(0, 2, 1)
+            yn_pos = torch.matmul(Hx, batch['yn_pos'].permute(0, 2, 1)).permute(0, 2, 1)
+            y_pos = torch.matmul(Hx, batch['y_pos'].unsqueeze(1).permute(0, 2, 1)).squeeze(-1)
+            hx, hy = h_ctx_ligand_torsion[batch['ligand_torsion_xy_index'][:, 0]], h_ctx_ligand_torsion[batch['ligand_torsion_xy_index'][:, 1]]
+            h_mol = scatter_add(h_ctx_ligand_torsion, dim=0, index=batch['ligand_element_torsion_batch'])
+            if self.config.random_alpha:
+                rand_dist = torch.distributions.normal.Normal(loc=0, scale=1)
+                rand_alpha = rand_dist.sample(hx.shape).to(self.device)
+                alpha = self.alpha_mlp(torch.cat([hx, hy, h_mol, rand_alpha], dim=-1))
+            else:
+                alpha = self.alpha_mlp(torch.cat([hx, hy, h_mol], dim=-1))
+            # rotate xn
+            R_alpha = self.build_alpha_rotation(torch.sin(alpha).squeeze(-1), torch.cos(alpha).squeeze(-1))
+            xn_pos = torch.matmul(R_alpha, xn_pos.permute(0, 2, 1)).permute(0, 2, 1)
+            p_idx, q_idx = torch.cartesian_prod(torch.arange(3), torch.arange(3)).chunk(2, dim=-1)
+            p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
+            pred_sin, pred_cos = dihedral_utils.batch_dihedrals(xn_pos[:, p_idx],
+                                                 torch.zeros_like(y_pos).unsqueeze(1).repeat(1, 9, 1),
+                                                 y_pos.unsqueeze(1).repeat(1, 9, 1),
+                                                 yn_pos[:, q_idx])
+            dihedral_loss = torch.mean(dihedral_utils.von_Mises_loss(batch['true_cos'], pred_cos.reshape(-1), batch['true_sin'], pred_cos.reshape(-1))[batch['dihedral_mask']])
+            torsion_loss = -dihedral_loss
+            loss_list[4] = torsion_loss.item()
+        else:
+            torsion_loss = 0
+        # dm: distance matrix
+        loss = pred_loss + comb_loss + focal_loss + dm_loss + torsion_loss + sr_loss
+        return loss, loss_list
+    def build_alpha_rotation(self, alpha, alpha_cos=None):
+        """
+        Builds the alpha rotation matrix
+        :param alpha: predicted values of torsion parameter alpha (n_dihedral_pairs)
+        :return: alpha rotation matrix (n_dihedral_pairs, 3, 3)
+        """
+        H_alpha = torch.FloatTensor([[[1, 0, 0], [0, 0, 0], [0, 0, 0]]]).repeat(alpha.shape[0], 1, 1).to(self.device)
+        if torch.is_tensor(alpha_cos):
+            H_alpha[:, 1, 1] = alpha_cos
+            H_alpha[:, 1, 2] = -alpha
+            H_alpha[:, 2, 1] = alpha
+            H_alpha[:, 2, 2] = alpha_cos
+        else:
+            H_alpha[:, 1, 1] = torch.cos(alpha)
+            H_alpha[:, 1, 2] = -torch.sin(alpha)
+            H_alpha[:, 2, 1] = torch.sin(alpha)
+            H_alpha[:, 2, 2] = torch.cos(alpha)
+        return H_alpha

motif_sample.py ADDED Viewed

	@@ -0,0 +1,660 @@

+import os
+import shutil
+import argparse
+import random
+import torch
+import numpy as np
+import math
+from vina import Vina
+from openbabel import pybel
+import subprocess
+import multiprocessing as mp
+from functools import partial
+from torch_geometric.data import Batch
+from tqdm.auto import tqdm
+from rdkit import Chem
+from rdkit.Geometry import Point3D
+from torch.utils.data import DataLoader
+from rdkit.Chem.rdchem import BondType
+from rdkit.Chem import ChemicalFeatures, rdMolDescriptors
+from rdkit import RDConfig
+from rdkit.Chem.Descriptors import MolLogP, qed
+from copy import deepcopy
+import tempfile
+import AutoDockTools
+import contextlib
+from torch_scatter import scatter_add, scatter_mean
+from rdkit.Geometry import Point3D
+from meeko import MoleculePreparation
+from meeko import obutils
+from models.flag import FLAG
+from utils.transforms import *
+from utils.datasets import get_dataset
+from utils.misc import *
+from utils.data import *
+from utils.mol_tree import *
+from utils.chemutils import *
+from utils.dihedral_utils import *
+from utils.sascorer import compute_sa_score
+from rdkit.Chem import AllChem
+# Placeholder initialised to None; nothing in the visible part of this file assigns it.
+_fscores = None
+# Feature family names; get_feat() emits one indicator column per entry, in this order.
+ATOM_FAMILIES = ['Acceptor', 'Donor', 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe', 'NegIonizable', 'PosIonizable',
+                 'ZnBinder']
+# Maps each family name to its position in ATOM_FAMILIES.
+ATOM_FAMILIES_ID = {s: i for i, s in enumerate(ATOM_FAMILIES)}
+# Status labels; not referenced in the visible portion of this file.
+STATUS_RUNNING = 'running'
+STATUS_FINISHED = 'finished'
+STATUS_FAILED = 'failed'
+def supress_stdout(func):
+    def wrapper(*a, **ka):
+        with open(os.devnull, 'w') as devnull:
+            with contextlib.redirect_stdout(devnull):
+                return func(*a, **ka)
+    return wrapper
+class PrepLig(object):
+    def __init__(self, input_mol, mol_format):
+        if mol_format == 'smi':
+            self.ob_mol = pybel.readstring('smi', input_mol)
+        elif mol_format == 'sdf':
+            self.ob_mol = next(pybel.readfile(mol_format, input_mol))
+        else:
+            raise ValueError(f'mol_format {mol_format} not supported')
+    def addH(self, polaronly=False, correctforph=True, PH=7):
+        self.ob_mol.OBMol.AddHydrogens(polaronly, correctforph, PH)
+        obutils.writeMolecule(self.ob_mol.OBMol, 'tmp_h.sdf')
+    def gen_conf(self):
+        sdf_block = self.ob_mol.write('sdf')
+        rdkit_mol = Chem.MolFromMolBlock(sdf_block, removeHs=False)
+        AllChem.EmbedMolecule(rdkit_mol, Chem.rdDistGeom.ETKDGv3())
+        self.ob_mol = pybel.readstring('sdf', Chem.MolToMolBlock(rdkit_mol))
+        obutils.writeMolecule(self.ob_mol.OBMol, 'conf_h.sdf')
+    @supress_stdout
+    def get_pdbqt(self, lig_pdbqt=None):
+        preparator = MoleculePreparation()
+        preparator.prepare(self.ob_mol.OBMol)
+        if lig_pdbqt is not None:
+            preparator.write_pdbqt_file(lig_pdbqt)
+            return
+        else:
+            return preparator.write_pdbqt_string()
+class PrepProt(object):
+    def __init__(self, pdb_file):
+        self.prot = pdb_file
+    def del_water(self, dry_pdb_file):  # optional
+        with open(self.prot) as f:
+            lines = [l for l in f.readlines() if l.startswith('ATOM') or l.startswith('HETATM')]
+            dry_lines = [l for l in lines if not 'HOH' in l]
+        with open(dry_pdb_file, 'w') as f:
+            f.write(''.join(dry_lines))
+        self.prot = dry_pdb_file
+    def addH(self, prot_pqr):  # call pdb2pqr
+        self.prot_pqr = prot_pqr
+        subprocess.Popen(['pdb2pqr30', '--ff=AMBER', self.prot, self.prot_pqr],
+                         stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL).communicate()
+    def get_pdbqt(self, prot_pdbqt):
+        prepare_receptor = os.path.join(AutoDockTools.__path__[0], 'Utilities24/prepare_receptor4.py')
+        subprocess.Popen(['python3', prepare_receptor, '-r', self.prot_pqr, '-o', prot_pdbqt],
+                         stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL).communicate()
+def calculate_vina(number, pro_path, lig_path):
+    lig_path = os.path.join(lig_path, str(number)+'.sdf')
+    size_factor = 1.2
+    buffer = 5.
+    # openmm_relax(pro_path)
+    # relax_sdf(lig_path)
+    mol = Chem.MolFromMolFile(lig_path, sanitize=True)
+    pos = mol.GetConformer(0).GetPositions()
+    center = np.mean(pos, 0)
+    ligand_pdbqt = './data/tmp/' + str(number) + '_lig.pdbqt'
+    protein_pqr = './data/tmp/' + str(number) + '_pro.pqr'
+    protein_pdbqt = './data/tmp/' + str(number) + '_pro.pdbqt'
+    lig = PrepLig(lig_path, 'sdf')
+    lig.addH()
+    lig.get_pdbqt(ligand_pdbqt)
+    prot = PrepProt(pro_path)
+    prot.addH(protein_pqr)
+    prot.get_pdbqt(protein_pdbqt)
+    v = Vina(sf_name='vina', seed=0, verbosity=0)
+    v.set_receptor(protein_pdbqt)
+    v.set_ligand_from_file(ligand_pdbqt)
+    x, y, z = (pos.max(0) - pos.min(0)) * size_factor + buffer
+    v.compute_vina_maps(center=center, box_size=[x, y, z])
+    energy = v.score()
+    print('Score before minimization: %.3f (kcal/mol)' % energy[0])
+    energy_minimized = v.optimize()
+    print('Score after minimization : %.3f (kcal/mol)' % energy_minimized[0])
+    v.dock(exhaustiveness=64, n_poses=32)
+    score = v.energies(n_poses=1)[0][0]
+    print('Score after docking : %.3f (kcal/mol)' % score)
+    return score
+def get_feat(mol):
+    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
+    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
+    atomic_numbers = torch.LongTensor([6, 7, 8, 9, 15, 16, 17])  # C N O F P S Cl
+    ptable = Chem.GetPeriodicTable()
+    Chem.SanitizeMol(mol)
+    feat_mat = np.zeros([mol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int_)
+    for feat in factory.GetFeaturesForMol(mol):
+        feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
+    ligand_element = torch.tensor([ptable.GetAtomicNumber(atom.GetSymbol()) for atom in mol.GetAtoms()])
+    element = ligand_element.view(-1, 1) == atomic_numbers.view(1, -1)  # (N_atoms, N_elements)
+    return torch.cat([element, torch.tensor(feat_mat)], dim=-1).float()
+def find_reference(protein_pos, focal_id):
+    # Select three reference protein atoms
+    d = torch.norm(protein_pos - protein_pos[focal_id], dim=1)
+    reference_idx = torch.topk(d, k=4, largest=False)[1]
+    reference_pos = protein_pos[reference_idx]
+    return reference_pos, reference_idx
+def SetAtomNum(mol, atoms):
+    for atom in mol.GetAtoms():
+        if atom.GetIdx() in atoms:
+            atom.SetAtomMapNum(1)
+        else:
+            atom.SetAtomMapNum(0)
+    return mol
+def SetMolPos(mol_list, pos_list):
+    new_mol_list = []
+    for i in range(len(pos_list)):
+        mol = mol_list[i]
+        conf = mol.GetConformer(0)
+        pos = pos_list[i].cpu().double().numpy()
+        if mol.GetNumAtoms() == len(pos):
+            for node in range(mol.GetNumAtoms()):
+                x, y, z = pos[node]
+                conf.SetAtomPosition(node, Point3D(x,y,z))
+            try:
+                AllChem.UFFOptimizeMolecule(mol)
+                new_mol_list.append(mol)
+            except:
+                new_mol_list.append(mol)
+    return new_mol_list
+def lipinski(mol):
+    count = 0
+    if qed(mol) <= 5:
+        count += 1
+    if Chem.Lipinski.NumHDonors(mol) <= 5:
+        count += 1
+    if Chem.Lipinski.NumHAcceptors(mol) <= 10:
+        count += 1
+    if Chem.Descriptors.ExactMolWt(mol) <= 500:
+        count += 1
+    if Chem.Lipinski.NumRotatableBonds(mol) <= 5:
+        count += 1
+    return count
+def refine_pos(ligand_pos, protein_pos, h_ctx_ligand, h_ctx_protein, model, batch, repeats, protein_batch,
+               ligand_batch):
+    protein_offsets = torch.cumsum(protein_batch.bincount(), dim=0)
+    ligand_offsets = torch.cumsum(ligand_batch.bincount(), dim=0)
+    protein_offsets, ligand_offsets = torch.cat([torch.tensor([0]), protein_offsets]), torch.cat([torch.tensor([0]), ligand_offsets])
+    sr_ligand_idx, sr_protein_idx = [], []
+    sr_ligand_idx0, sr_ligand_idx1 = [], []
+    for i in range(len(repeats)):
+        alpha_index = batch['alpha_carbon_indicator'][protein_batch == i].nonzero().reshape(-1)
+        ligand_atom_index = torch.arange(repeats[i])
+        p_idx, q_idx = torch.cartesian_prod(ligand_atom_index, torch.arange(len(alpha_index))).chunk(2, dim=-1)
+        p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
+        sr_ligand_idx.append(ligand_atom_index[p_idx] + ligand_offsets[i])
+        sr_protein_idx.append(alpha_index[q_idx] + protein_offsets[i])
+        p_idx, q_idx = torch.cartesian_prod(ligand_atom_index, ligand_atom_index).chunk(2, dim=-1)
+        p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
+        sr_ligand_idx0.append(ligand_atom_index[p_idx] + ligand_offsets[i])
+        sr_ligand_idx1.append(ligand_atom_index[q_idx] + ligand_offsets[i])
+    sr_ligand_idx, sr_protein_idx = torch.cat(sr_ligand_idx).long(), torch.cat(sr_protein_idx).long()
+    sr_ligand_idx0, sr_ligand_idx1 = torch.cat(sr_ligand_idx0).long(), torch.cat(sr_ligand_idx1).long()
+    dist_alpha = torch.norm(ligand_pos[sr_ligand_idx] - protein_pos[sr_protein_idx], dim=1)
+    dist_intra = torch.norm(ligand_pos[sr_ligand_idx0] - ligand_pos[sr_ligand_idx1], dim=1)
+    input_dir_alpha = ligand_pos[sr_ligand_idx] - protein_pos[sr_protein_idx]
+    input_dir_intra = ligand_pos[sr_ligand_idx0] - ligand_pos[sr_ligand_idx1]
+    distance_emb1 = model.encoder.distance_expansion(torch.norm(input_dir_alpha, dim=1))
+    distance_emb2 = model.encoder.distance_expansion(torch.norm(input_dir_intra, dim=1))
+    input1 = torch.cat([h_ctx_ligand[sr_ligand_idx], h_ctx_protein[sr_protein_idx], distance_emb1], dim=-1)[dist_alpha <= 10.0]
+    input2 = torch.cat([h_ctx_ligand[sr_ligand_idx0], h_ctx_ligand[sr_ligand_idx1], distance_emb2], dim=-1)[dist_intra <= 10.0]
+    # distance cut_off
+    norm_dir1 = F.normalize(input_dir_alpha, p=2, dim=1)[dist_alpha <= 10.0]
+    norm_dir2 = F.normalize(input_dir_intra, p=2, dim=1)[dist_intra <= 10.0]
+    force1 = scatter_mean(model.refine_protein(input1) * norm_dir1, dim=0, index=sr_ligand_idx[dist_alpha <= 10.0], dim_size=ligand_pos.size(0))
+    force2 = scatter_mean(model.refine_ligand(input2) * norm_dir2, dim=0, index=sr_ligand_idx0[dist_intra <= 10.0], dim_size=ligand_pos.size(0))
+    ligand_pos += force1
+    ligand_pos += force2
+    ligand_pos = [ligand_pos[ligand_batch==k].float() for k in range(len(repeats))]
+    return ligand_pos
+def ligand_gen(batch, model, vocab, config, center, device, refinement=False):
+    """
+    Grow one ligand per batch entry, one motif per step, for up to config.sample.max_steps steps.
+    Step 0 picks a focal protein atom per sample, predicts the first motif and places it.
+    Every later step picks a focal ligand atom, predicts and attaches the next motif,
+    places the new atoms and, for attachments flagged as rotatable, applies a predicted rotation.
+    :param batch: collated batch; keys read here: 'protein_pos', 'protein_atom_feature',
+        'ligand_context_pos', 'ligand_context_feature_full', 'protein_element_batch',
+        'ligand_context_element_batch'
+    :param model: must provide __call__, forward_motif, forward_attach, forward_alpha and dist_mlp
+    :param vocab: motif vocabulary; size() and get_smiles() are used
+    :param config: config.sample.batch_size and config.sample.max_steps are read
+    :param center: point towards which the first motif is shifted
+    :param device: torch device for newly created tensors
+    :param refinement: when True, refine_pos() adjusts all ligand positions at each step after the first
+    :return: (mol_list, pos_list) - molecules and their per-atom coordinate tensors
+    """
+    pos_list = []
+    feat_list = []
+    # motif_id[j]: key of the motif currently in focus for sample j
+    motif_id = [0 for _ in range(config.sample.batch_size)]
+    finished = torch.zeros(config.sample.batch_size).bool()
+    for i in range(config.sample.max_steps):
+        print(i)
+        print(finished)
+        if torch.sum(finished) == config.sample.batch_size:
+            # mol_list = SetMolPos(mol_list, pos_list)
+            # NOTE(review): mol_list is first bound in the i == 0 branch below; this return
+            # would raise NameError if it were reached at i == 0 (e.g. batch_size == 0).
+            return mol_list, pos_list
+        if i == 0:
+            # ---- first step: choose a focal protein atom and place the first motif ----
+            focal_pred, mask_protein, h_ctx = model(protein_pos=batch['protein_pos'],
+                                                    protein_atom_feature=batch['protein_atom_feature'].float(),
+                                                    ligand_pos=batch['ligand_context_pos'],
+                                                    ligand_atom_feature=batch['ligand_context_feature_full'].float(),
+                                                    batch_protein=batch['protein_element_batch'],
+                                                    batch_ligand=batch['ligand_context_element_batch'])
+            protein_atom_feature = batch['protein_atom_feature'].float()
+            focal_protein = focal_pred[mask_protein]
+            h_ctx_protein = h_ctx[mask_protein]
+            focus_score = torch.sigmoid(focal_protein)
+            #can_focus = focus_score > 0.5
+            # slice_idx[j]:slice_idx[j + 1] is the protein-atom range of sample j
+            slice_idx = torch.cat([torch.tensor([0]).to(device), torch.cumsum(batch['protein_element_batch'].bincount(), dim=0)])
+            focal_id = []
+            for j in range(len(slice_idx) - 1):
+                focus = focus_score[slice_idx[j]:slice_idx[j + 1]]
+                # highest-scoring protein atom of sample j, as an index into the whole batch
+                focal_id.append(torch.argmax(focus.reshape(-1).float()).item() + slice_idx[j].item())
+            focal_id = torch.tensor(focal_id, device=device)
+            h_ctx_focal = h_ctx_protein[focal_id]
+            # vocab.size() is passed as the current motif id; presumably a "no motif yet" token - TODO confirm
+            current_wid = torch.tensor([vocab.size()] * config.sample.batch_size, device=device)
+            next_motif_wid, motif_prob = model.forward_motif(h_ctx_focal, current_wid, torch.arange(config.sample.batch_size, device=device).to(device))
+            mol_list = [Chem.MolFromSmiles(vocab.get_smiles(id)) for id in next_motif_wid]
+            for j in range(config.sample.batch_size):
+                AllChem.EmbedMolecule(mol_list[j])
+                AllChem.UFFOptimizeMolecule(mol_list[j])
+                ligand_pos, ligand_feat = torch.tensor(mol_list[j].GetConformer().GetPositions(), device=device), get_feat(mol_list[j]).to(device)
+                feat_list.append(ligand_feat)
+                # set the initial positions with distance matrix
+                reference_pos, reference_idx = find_reference(batch['protein_pos'][slice_idx[j]:slice_idx[j + 1]], focal_id[j] - slice_idx[j])
+                # NOTE(review): reference_idx is relative to sample j's slice, yet it indexes the
+                # batch-wide protein_atom_feature below - confirm this is intended for j > 0.
+                p_idx, l_idx = torch.cartesian_prod(torch.arange(4), torch.arange(len(ligand_pos))).chunk(2, dim=-1)
+                p_idx = p_idx.squeeze(-1).to(device)
+                l_idx = l_idx.squeeze(-1).to(device)
+                # predicted value for every (reference protein atom, motif atom) pair, squared on the next line
+                d_m = model.dist_mlp(torch.cat([protein_atom_feature[reference_idx[p_idx]], ligand_feat[l_idx]], dim=-1)).reshape(4,len(ligand_pos))
+                d_m = d_m ** 2
+                p_d, l_d = self_square_dist(reference_pos), self_square_dist(ligand_pos)
+                # block matrix [[p_d, d_m], [d_m^T, l_d]] over the reference atoms plus the motif atoms
+                D = torch.cat([torch.cat([p_d, d_m], dim=1), torch.cat([d_m.permute(1, 0), l_d], dim=1)])
+                coordinate = eig_coord_from_dist(D)
+                new_pos, _, _ = kabsch_torch(coordinate[:len(reference_pos)], reference_pos,
+                                             coordinate[len(reference_pos):])
+                # new_pos += (center*0.8+torch.mean(reference_pos, dim=0)*0.2) - torch.mean(new_pos, dim=0)
+                # move the motif 80% of the way from its centroid towards `center`
+                new_pos += (center - torch.mean(new_pos, dim=0)) * .8
+                pos_list.append(new_pos)
+            # per-sample bookkeeping: atom index -> motif key, motif key -> atom indices, motif key -> vocab id
+            atom_to_motif = [{} for _ in range(config.sample.batch_size)]
+            motif_to_atoms = [{} for _ in range(config.sample.batch_size)]
+            motif_wid = [{} for _ in range(config.sample.batch_size)]
+            for j in range(config.sample.batch_size):
+                for k in range(mol_list[j].GetNumAtoms()):
+                    atom_to_motif[j][k] = 0
+            for j in range(config.sample.batch_size):
+                motif_to_atoms[j][0] = list(np.arange(mol_list[j].GetNumAtoms()))
+                motif_wid[j][0] = next_motif_wid[j].item()
+        else:
+            # ---- later steps: extend every unfinished ligand by one motif ----
+            repeats = torch.tensor([len(pos) for pos in pos_list], device=device)
+            ligand_batch = torch.repeat_interleave(torch.arange(config.sample.batch_size, device=device), repeats)
+            focal_pred, mask_protein, h_ctx = model(protein_pos=batch['protein_pos'].float(),
+                                                    protein_atom_feature=batch['protein_atom_feature'].float(),
+                                                    ligand_pos=torch.cat(pos_list, dim=0).float(),
+                                                    ligand_atom_feature=torch.cat(feat_list, dim=0).float(),
+                                                    batch_protein=batch['protein_element_batch'],
+                                                    batch_ligand=ligand_batch)
+            # structure refinement
+            if refinement:
+                pos_list = refine_pos(torch.cat(pos_list, dim=0).float(), batch['protein_pos'].float(),
+                                      h_ctx[~mask_protein], h_ctx[mask_protein], model, batch, repeats.tolist(),
+                                      batch['protein_element_batch'], ligand_batch)
+            focal_ligand = focal_pred[~mask_protein]
+            h_ctx_ligand = h_ctx[~mask_protein]
+            focus_score = torch.sigmoid(focal_ligand)
+            # NOTE(review): a sigmoid output is positive, so this threshold only excludes
+            # NaN or underflowed scores - confirm whether 0.5 was intended.
+            can_focus = focus_score > 0.
+            # slice_idx[j]:slice_idx[j + 1] is now the ligand-atom range of sample j
+            slice_idx = torch.cat([torch.tensor([0], device=device), torch.cumsum(repeats, dim=0)])
+            current_atoms_batch, current_atoms = [], []
+            for j in range(len(slice_idx) - 1):
+                focus = focus_score[slice_idx[j]:slice_idx[j + 1]]
+                if torch.sum(can_focus[slice_idx[j]:slice_idx[j + 1]]) > 0 and ~finished[j]:
+                    # sample the focal atom in proportion to its score, then focus on its motif
+                    sample_focal_atom = torch.multinomial(focus.reshape(-1).float(), 1)
+                    focal_motif = atom_to_motif[j][sample_focal_atom.item()]
+                    motif_id[j] = focal_motif
+                else:
+                    finished[j] = True
+                # atoms of the focused motif, as batch-wide ligand indices (finished samples keep their last motif)
+                current_atoms.extend((np.array(motif_to_atoms[j][motif_id[j]]) + slice_idx[j].item()).tolist())
+                current_atoms_batch.extend([j] * len(motif_to_atoms[j][motif_id[j]]))
+                mol_list[j] = SetAtomNum(mol_list[j], motif_to_atoms[j][motif_id[j]])
+            # second step: next motif prediction
+            current_wid = [motif_wid[j][motif_id[j]] for j in range(len(mol_list))]
+            next_motif_wid, motif_prob = model.forward_motif(h_ctx_ligand[torch.tensor(current_atoms)],
+                                                 torch.tensor(current_wid).to(device),
+                                                 torch.tensor(current_atoms_batch).to(device))
+            # assemble
+            next_motif_smiles = [vocab.get_smiles(id) for id in next_motif_wid]
+            new_mol_list, new_atoms, one_atom_attach, intersection, attach_fail = model.forward_attach(mol_list, next_motif_smiles, device)
+            for j in range(len(mol_list)):
+                # NOTE(review): if attach_fail holds plain Python bools, `~attach_fail[j]` is a
+                # bitwise NOT (-1 or -2) and is always truthy - verify against forward_attach.
+                if ~finished[j] and ~attach_fail[j]:
+                    # num_new_atoms
+                    mol_list[j] = new_mol_list[j]
+            # rotatable[j]: focused motif has exactly two atoms, one_atom_attach[j] is set,
+            # the attachment succeeded and the sample is not finished
+            rotatable = torch.logical_and(torch.tensor(current_atoms_batch).bincount() == 2, torch.tensor(one_atom_attach))
+            rotatable = torch.logical_and(rotatable, ~torch.tensor(attach_fail))
+            rotatable = torch.logical_and(rotatable, ~finished).to(device)
+            # update motif2atoms and atom2motif
+            for j in range(len(mol_list)):
+                if attach_fail[j] or finished[j]:
+                    continue
+                # the step counter i doubles as the key of the motif added in this step
+                motif_to_atoms[j][i] = new_atoms[j]
+                motif_wid[j][i] = next_motif_wid[j]
+                for k in new_atoms[j]:
+                    atom_to_motif[j][k] = i
+                    '''
+                    if k in atom_to_motif[j]:
+                        continue
+                    else:
+                        atom_to_motif[j][k] = i'''
+            # generate initial positions
+            for j in range(len(mol_list)):
+                if attach_fail[j] or finished[j]:
+                    continue
+                mol = mol_list[j]
+                # atoms tagged 1 by SetAtomNum above: the motif the new one was attached to
+                anchor = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetAtomMapNum() == 1]
+                # positions = mol.GetConformer().GetPositions()
+                anchor_pos = deepcopy(pos_list[j][anchor]).to(device)
+                Chem.SanitizeMol(mol)
+                AllChem.EmbedMolecule(mol, useRandomCoords=True)
+                try:
+                    AllChem.UFFOptimizeMolecule(mol)
+                except:
+                    print('UFF error')
+                anchor_pos_new = mol.GetConformer(0).GetPositions()[anchor]
+                # presumably forward_attach tags the newly added atoms with map number 2 - TODO confirm
+                new_idx = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetAtomMapNum() == 2]
+                '''
+                R, T = kabsch(np.matrix(anchor_pos), np.matrix(anchor_pos_new))
+                new_pos = R * np.matrix(mol.GetConformer().GetPositions()[new_idx]).T + np.tile(T, (1, len(new_idx)))
+                new_pos = np.array(new_pos.T)'''
+                new_pos = mol.GetConformer().GetPositions()[new_idx]
+                # align the fresh embedding onto the existing anchor coordinates and carry the new atoms along
+                new_pos, _, _ = kabsch_torch(torch.tensor(anchor_pos_new, device=device), anchor_pos, torch.tensor(new_pos, device=device))
+                conf = mol.GetConformer()
+                # update curated parameters
+                # new atoms are appended, so this assumes they carry the highest atom indices - TODO confirm
+                pos_list[j] = torch.cat([pos_list[j], new_pos])
+                feat_list[j] = get_feat(mol_list[j]).to(device)
+                for node in range(mol.GetNumAtoms()):
+                    conf.SetAtomPosition(node, np.array(pos_list[j][node].cpu()))
+                assert mol.GetNumAtoms() == len(pos_list[j])
+            # predict alpha and rotate (only change the position)
+            if torch.sum(rotatable) > 0 and i >= 2:
+                repeats = torch.tensor([len(pos) for pos in pos_list])
+                ligand_batch = torch.repeat_interleave(torch.arange(len(pos_list)), repeats).to(device)
+                slice_idx = torch.cat([torch.tensor([0]), torch.cumsum(repeats, dim=0)])
+                # batch-wide indices of the two atoms of each rotatable sample's focused motif
+                xy_index = [(np.array(motif_to_atoms[j][motif_id[j]]) + slice_idx[j].item()).tolist() for j in range(len(slice_idx) - 1) if rotatable[j]]
+                alpha = model.forward_alpha(protein_pos=batch['protein_pos'].float(),
+                                            protein_atom_feature=batch['protein_atom_feature'].float(),
+                                            ligand_pos=torch.cat(pos_list, dim=0).float(),
+                                            ligand_atom_feature=torch.cat(feat_list, dim=0).float(),
+                                            batch_protein=batch['protein_element_batch'],
+                                            batch_ligand=ligand_batch, xy_index=torch.tensor(xy_index, device=device),
+                                            rotatable=rotatable)
+                rotatable_id = [id for id in range(len(mol_list)) if rotatable[id]]
+                # same two atoms again, this time as per-molecule indices
+                xy_index = [motif_to_atoms[j][motif_id[j]] for j in range(len(slice_idx) - 1) if rotatable[j]]
+                x_index = [intersection[j] for j in range(len(slice_idx) - 1) if rotatable[j]]
+                # y is the atom of the focused pair that is not in the intersection
+                y_index = [(set(xy_index[k]) - set(x_index[k])).pop() for k in range(len(x_index))]
+                for j in range(len(alpha)):
+                    mol = mol_list[rotatable_id[j]]
+                    new_idx = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetAtomMapNum() == 2]
+                    positions = deepcopy(pos_list[rotatable_id[j]])
+                    xn_pos = positions[new_idx].float()
+                    # direction from atom y to atom x, and atom x as reference point, both passed to rand_rotate
+                    dir=(positions[x_index[j]] - positions[y_index[j]]).reshape(-1)
+                    ref=positions[x_index[j]].reshape(-1)
+                    xn_pos = rand_rotate(dir.to(device), ref.to(device), xn_pos.to(device), alpha[j], device=device)
+                    if xn_pos.shape[0] > 0:
+                        # overwrite the trailing rows, i.e. the atoms appended in this step
+                        pos_list[rotatable_id[j]][-len(xn_pos):] = xn_pos
+                    conf = mol.GetConformer()
+                    for node in range(mol.GetNumAtoms()):
+                        conf.SetAtomPosition(node, np.array(pos_list[rotatable_id[j]][node].cpu()))
+                    assert mol.GetNumAtoms() == len(pos_list[rotatable_id[j]])
+    return mol_list, pos_list
+def demo(data_id):
+    vocab_path = 'vocab.txt'
+    device = 'cpu'
+    config = './configs/sample.yml'
+    vocab = []
+    for line in open(vocab_path):
+        p, _, _ = line.partition(':')
+        vocab.append(p)
+    vocab = Vocab(vocab)
+    # Load configs
+    config = load_config(config)
+    # Data
+    protein_featurizer = FeaturizeProteinAtom()
+    ligand_featurizer = FeaturizeLigandAtom()
+    masking = LigandMaskAll(vocab)
+    transform = Compose([
+        LigandCountNeighbors(),
+        protein_featurizer,
+        ligand_featurizer,
+        FeaturizeLigandBond(),
+        masking,
+    ])
+    dataset, subsets = get_dataset(
+        config=config.dataset,
+        transform=transform,
+    )
+    testset = subsets['test']
+    data = testset[data_id%100]
+    center = data['ligand_center'].to(device)
+    test_set = [data for _ in range(config.sample.num_samples)]
+    # Model (Main)
+    ckpt = torch.load(config.model.checkpoint, map_location=device)
+    model = FLAG(
+        ckpt['config'].model,
+        protein_atom_feature_dim=protein_featurizer.feature_dim,
+        ligand_atom_feature_dim=ligand_featurizer.feature_dim,
+        vocab=vocab,
+    ).to(device)
+    model.load_state_dict(ckpt['model'])
+    # my code goes here
+    sample_loader = DataLoader(test_set, batch_size=config.sample.batch_size,
+                               shuffle=False, num_workers=config.sample.num_workers,
+                               collate_fn=collate_mols)
+    with torch.no_grad():
+        model.eval()
+        number = 0
+        for batch in tqdm(sample_loader):
+            for key in batch:
+                batch[key] = batch[key].to(device)
+            gen_data, pos_list = ligand_gen(batch, model, vocab, config, center, device)
+            SetMolPos(gen_data, pos_list)
+            for mol in gen_data:
+                try:
+                    AllChem.UFFOptimizeMolecule(mol)
+                except:
+                    print('UFF error')
+            for _, mol in enumerate(gen_data):
+                number += 1
+                if mol.GetNumAtoms() < 12 or MolLogP(mol) < 0.60:
+                    continue
+                filename = os.path.join('./data', 'Ligand.sdf')
+                writer = Chem.SDWriter(filename)
+                # writer.SetKekulize(False)
+                writer.write(mol, confId=0)
+                writer.close()
+                return filename
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--config', type=str, default='./configs/sample.yml')
+    parser.add_argument('-i', '--data_id', type=int, default=0)
+    parser.add_argument('--device', type=str, default='cuda:0')
+    parser.add_argument('--outdir', type=str, default='./outputs')
+    parser.add_argument('--vocab_path', type=str, default='vocab.txt')
+    parser.add_argument('--num_workers', type=int, default=64)
+    args = parser.parse_args()
+    # Load vocab
+    vocab = []
+    for line in open(args.vocab_path):
+        p, _, _ = line.partition(':')
+        vocab.append(p)
+    vocab = Vocab(vocab)
+    # Load configs
+    config = load_config(args.config)
+    config_name = os.path.basename(args.config)[:os.path.basename(args.config).rfind('.')]
+    seed_all(config.sample.seed)
+    # Logging
+    log_dir = get_new_log_dir(args.outdir, prefix='%s-%d' % (config_name, args.data_id))
+    logger = get_logger('sample', log_dir)
+    logger.info(args)
+    logger.info(config)
+    shutil.copyfile(args.config, os.path.join(log_dir, os.path.basename(args.config)))
+    # Data
+    logger.info('Loading data...')
+    protein_featurizer = FeaturizeProteinAtom()
+    ligand_featurizer = FeaturizeLigandAtom()
+    masking = LigandMaskAll(vocab)
+    transform = Compose([
+        LigandCountNeighbors(),
+        protein_featurizer,
+        ligand_featurizer,
+        FeaturizeLigandBond(),
+        masking,
+    ])
+    dataset, subsets = get_dataset(
+        config=config.dataset,
+        transform=transform,
+    )
+    testset = subsets['test']
+    data = testset[args.data_id]
+    center = data['ligand_center'].to(args.device)
+    test_set = [data for _ in range(config.sample.num_samples)]
+    with open(os.path.join(log_dir, 'pocket_info.txt'), 'a') as f:
+        f.write(data['protein_filename'] + '\n')
+    # Model (Main)
+    logger.info('Loading main model...')
+    ckpt = torch.load(config.model.checkpoint, map_location=args.device)
+    model = FLAG(
+        ckpt['config'].model,
+        protein_atom_feature_dim=protein_featurizer.feature_dim,
+        ligand_atom_feature_dim=ligand_featurizer.feature_dim,
+        vocab=vocab,
+    ).to(args.device)
+    model.load_state_dict(ckpt['model'])
+    # my code goes here
+    sample_loader = DataLoader(test_set, batch_size=config.sample.batch_size,
+                               shuffle=False, num_workers=config.sample.num_workers,
+                               collate_fn=collate_mols)
+    data_list = []
+    try:
+        with torch.no_grad():
+            model.eval()
+            number = 0
+            number_list = []
+            for batch in tqdm(sample_loader):
+                for key in batch:
+                    batch[key] = batch[key].to(args.device)
+                gen_data, pos_list = ligand_gen(batch, model, vocab, config, center, args.device)
+                SetMolPos(gen_data, pos_list)
+                for mol in gen_data:
+                    try:
+                        AllChem.UFFOptimizeMolecule(mol)
+                    except:
+                        print('UFF error')
+                data_list.extend(gen_data)
+                with open(os.path.join(log_dir, 'SMILES.txt'), 'a') as smiles_f:
+                    for _, mol in enumerate(gen_data):
+                        number+=1
+                        if mol.GetNumAtoms() < 12 or MolLogP(mol) < 0.60:
+                            continue
+                        smiles_f.write(Chem.MolToSmiles(mol) + '\n')
+                        writer = Chem.SDWriter(os.path.join(log_dir, '%d.sdf' % number))
+                        # writer.SetKekulize(False)
+                        writer.write(mol, confId=0)
+                        writer.close()
+                        number_list.append(number)
+                # Calculate metrics
+                print([Chem.MolToSmiles(mol) for mol in data_list])
+                smiles = [Chem.MolFromSmiles(Chem.MolToSmiles(mol)) for mol in data_list]
+                qed_list = [qed(mol) for mol in smiles if mol.GetNumAtoms() >= 8]
+                logp_list = [MolLogP(mol) for mol in smiles]
+                sa_list = [compute_sa_score(mol) for mol in smiles]
+                Lip_list = [lipinski(mol) for mol in smiles]
+                print('QED %.6f | LogP %.6f | SA %.6f | Lipinski %.6f \n' % (np.average(qed_list), np.average(logp_list), np.average(sa_list), np.average(Lip_list)))
+    except KeyboardInterrupt:
+        logger.info('Terminated. Generated molecules will be saved.')
+        with open(os.path.join(log_dir, 'SMILES.txt'), 'a') as smiles_f:
+            for i, mol in enumerate(data_list):
+                if mol.GetNumAtoms() < 12 or MolLogP(mol) < 0.60:
+                    continue
+                smiles_f.write(Chem.MolToSmiles(mol) + '\n')
+                writer = Chem.SDWriter(os.path.join(log_dir, '%d.sdf' % i))
+                # writer.SetKekulize(False)
+                writer.write(mol, confId=0)
+                writer.close()
+    pool = mp.Pool(args.num_workers)
+    vina_list = []
+    pro_path = '/n/holyscratch01/mzitnik_lab/zaixizhang/pdbbind_pocket10/' + os.path.join(data['pdbid'], data['pdbid']+'_pocket.pdb')
+    for vina_score in tqdm(pool.imap_unordered(partial(calculate_vina, pro_path=pro_path, lig_path=log_dir), number_list), total=len(number_list)):
+        if vina_score != None:
+            vina_list.append(vina_score)
+    pool.close()
+    print('Vina: ', np.average(vina_list))

requirements.txt ADDED Viewed

	@@ -0,0 +1,390 @@

+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+_libgcc_mutex=0.1=conda_forge
+_openmp_mutex=4.5=2_kmp_llvm
+abseil-cpp=20211102.0=hd4dd3e8_0
+absl-py=1.4.0=py38h06a4308_0
+aiofiles=23.2.1=pypi_0
+aiohttp=3.8.5=py38h5eee18b_0
+aiosignal=1.2.0=pyhd3eb1b0_0
+altair=5.1.2=pypi_0
+amberlite=22.0=pypi_0
+ambertools=22.0=py38h6177452_1
+amberutils=21.0=pypi_0
+annotated-types=0.6.0=pypi_0
+antlr4-python3-runtime=4.9.3=pypi_0
+anyio=3.7.1=pypi_0
+appdirs=1.4.4=pyhd3eb1b0_0
+argon2-cffi=21.3.0=pyhd3eb1b0_0
+argon2-cffi-bindings=21.2.0=py38h7f8727e_0
+arpack=3.7.0=hdefa2d7_2
+arrow-cpp=11.0.0=hda39474_2
+asttokens=2.0.5=pyhd3eb1b0_0
+astunparse=1.6.3=py_0
+async-timeout=4.0.2=py38h06a4308_0
+attrs=22.1.0=py38h06a4308_0
+autodocktools-py3=1.5.7.post1+9.gda0c87c=pypi_0
+aws-c-common=0.4.57=he6710b0_1
+aws-c-event-stream=0.1.6=h2531618_5
+aws-checksums=0.1.9=he6710b0_0
+aws-sdk-cpp=1.8.185=hce553d0_0
+backcall=0.2.0=pyhd3eb1b0_0
+beautifulsoup4=4.12.2=py38h06a4308_0
+bio=1.5.9=pypi_0
+biopython=1.81=pypi_0
+biothings-client=0.3.0=pypi_0
+blas=1.0=mkl
+bleach=4.1.0=pyhd3eb1b0_0
+blinker=1.4=py38h06a4308_0
+blosc=1.21.3=h6a678d5_0
+boost=1.74.0=py38h2b96118_5
+boost-cpp=1.74.0=h75c5d50_8
+bottleneck=1.3.5=py38h7deecbd_0
+brotli=1.0.9=h5eee18b_7
+brotli-bin=1.0.9=h5eee18b_7
+brotlipy=0.7.0=py38h27cfd23_1003
+bzip2=1.0.8=h7b6447c_0
+c-ares=1.19.1=h5eee18b_0
+c-blosc2=2.8.0=h6a678d5_0
+ca-certificates=2023.08.22=h06a4308_0
+cachetools=4.2.2=pyhd3eb1b0_0
+cairo=1.16.0=hb05425b_5
+certifi=2023.7.22=py38h06a4308_0
+cffi=1.15.1=py38h74dc2b5_0
+charset-normalizer=2.0.4=pyhd3eb1b0_0
+click=8.0.4=py38h06a4308_0
+comm=0.1.2=py38h06a4308_0
+contourpy=1.0.5=py38hdb19cb5_0
+cryptography=41.0.3=py38h130f0dd_0
+cuda-cudart=11.7.99=0
+cuda-cupti=11.7.101=0
+cuda-libraries=11.7.1=0
+cuda-nvrtc=11.7.99=0
+cuda-nvtx=11.7.91=0
+cuda-runtime=11.7.1=0
+cudatoolkit=11.8.0=h6a678d5_0
+curl=8.2.1=h37d81fd_0
+cycler=0.11.0=pyhd3eb1b0_0
+cython=3.0.2=py38h17151c0_0
+dataclasses=0.8=pyh6d0b6a4_7
+datamol=0.11.4=pypi_0
+datasets=2.12.0=py38h06a4308_0
+debugpy=1.6.7=py38h6a678d5_0
+decorator=5.1.1=pyhd3eb1b0_0
+defusedxml=0.7.1=pyhd3eb1b0_0
+dill=0.3.6=py38h06a4308_0
+docutils=0.17.1=pypi_0
+easydict=1.9=py_0
+entrypoints=0.4=py38h06a4308_0
+exceptiongroup=1.1.3=pypi_0
+executing=0.8.3=pyhd3eb1b0_0
+expat=2.5.0=h6a678d5_0
+fair-esm=2.0.0=pypi_0
+fastapi=0.104.0=pypi_0
+fasteners=0.19=pypi_0
+ffmpeg=4.2.2=h20bf706_0
+ffmpy=0.3.1=pypi_0
+fftw=3.3.10=nompi_hf0379b8_106
+filelock=3.9.0=py38h06a4308_0
+fontconfig=2.14.2=h14ed4e7_0
+fonttools=4.25.0=pyhd3eb1b0_0
+freetype=2.12.1=h4a9f257_0
+frozenlist=1.3.3=py38h5eee18b_0
+fsspec=2023.4.0=py38h06a4308_0
+gflags=2.2.2=he6710b0_0
+giflib=5.2.1=h5eee18b_3
+glib=2.69.1=h4ff587b_1
+glog=0.5.0=h2531618_0
+gmp=6.2.1=h295c915_3
+gnutls=3.6.15=he1e5248_0
+google-auth=2.22.0=py38h06a4308_0
+google-auth-oauthlib=0.5.2=py38h06a4308_0
+gprofiler-official=1.0.0=pypi_0
+gradio=3.50.2=pypi_0
+gradio-client=0.6.1=pypi_0
+greenlet=2.0.1=py38h6a678d5_0
+griddataformats=1.0.1=pypi_0
+grpc-cpp=1.48.2=h5bf31a4_0
+grpcio=1.48.2=py38h5bf31a4_0
+gsd=3.1.1=pypi_0
+h11=0.14.0=pypi_0
+hdf4=4.2.15=h9772cbc_5
+hdf5=1.12.1=nompi_h2386368_104
+httpcore=0.18.0=pypi_0
+httpx=0.25.0=pypi_0
+huggingface_hub=0.15.1=py38h06a4308_0
+icu=70.1=h27087fc_0
+idna=3.4=py38h06a4308_0
+importlib-metadata=6.0.0=py38h06a4308_0
+importlib_metadata=6.0.0=hd3eb1b0_0
+importlib_resources=5.2.0=pyhd3eb1b0_1
+intel-openmp=2021.4.0=h06a4308_3561
+ipykernel=6.25.0=py38h2f386ee_0
+ipython=8.12.2=py38h06a4308_0
+ipython_genutils=0.2.0=pyhd3eb1b0_1
+jedi=0.18.1=py38h06a4308_1
+jinja2=3.1.2=py38h06a4308_0
+joblib=1.2.0=py38h06a4308_0
+jpeg=9e=h5eee18b_1
+jsonschema=4.17.3=py38h06a4308_0
+jupyter_client=7.4.9=py38h06a4308_0
+jupyter_core=5.3.0=py38h06a4308_0
+jupyter_server=1.23.4=py38h06a4308_0
+jupyterlab_pygments=0.1.2=py_0
+kiwisolver=1.4.4=py38h6a678d5_0
+krb5=1.20.1=h568e23c_1
+lame=3.100=h7b6447c_0
+lcms2=2.12=h3be6417_0
+ld_impl_linux-64=2.38=h1181459_1
+lerc=3.0=h295c915_0
+libblas=3.9.0=12_linux64_mkl
+libbrotlicommon=1.0.9=h5eee18b_7
+libbrotlidec=1.0.9=h5eee18b_7
+libbrotlienc=1.0.9=h5eee18b_7
+libcublas=11.10.3.66=0
+libcufft=10.7.2.124=h4fbf590_0
+libcufile=1.7.1.12=0
+libcurand=10.3.3.129=0
+libcurl=8.2.1=h91b91d3_0
+libcusolver=11.4.0.1=0
+libcusparse=11.7.4.91=0
+libdeflate=1.17=h5eee18b_0
+libedit=3.1.20221030=h5eee18b_0
+libev=4.33=h7f8727e_1
+libevent=2.1.12=h8f2d780_0
+libffi=3.3=he6710b0_2
+libgcc-ng=13.1.0=he5830b7_0
+libgfortran-ng=11.2.0=h00389a5_1
+libgfortran5=11.2.0=h1234567_1
+libgomp=13.1.0=he5830b7_0
+libiconv=1.16=h7f8727e_2
+libidn2=2.3.4=h5eee18b_0
+liblapack=3.9.0=12_linux64_mkl
+libnetcdf=4.8.1=nompi_h329d8a1_102
+libnghttp2=1.52.0=ha637b67_1
+libnpp=11.7.4.75=0
+libnsl=2.0.0=h7f98852_0
+libnvjpeg=11.8.0.2=0
+libopus=1.3.1=h7b6447c_0
+libpng=1.6.39=h5eee18b_0
+libprotobuf=3.20.3=he621ea3_0
+libsodium=1.0.18=h7b6447c_0
+libssh2=1.10.0=h37d81fd_2
+libstdcxx-ng=13.1.0=hfd8a6a1_0
+libtasn1=4.19.0=h5eee18b_0
+libthrift=0.15.0=h0d84882_2
+libtiff=4.5.1=h6a678d5_0
+libunistring=0.9.10=h27cfd23_0
+libuuid=2.38.1=h0b41bf4_0
+libvpx=1.7.0=h439df22_0
+libwebp=1.2.4=h11a3e52_1
+libwebp-base=1.2.4=h5eee18b_1
+libxcb=1.15=h7f8727e_0
+libxml2=2.9.14=h22db469_4
+libxslt=1.1.35=h8affb1d_0
+libzip=1.9.2=hc869a4a_1
+libzlib=1.2.13=hd590300_5
+littleutils=0.2.2=pypi_0
+llvm-openmp=14.0.6=h9e868ea_0
+loguru=0.7.2=pypi_0
+lxml=4.9.1=py38h1edc446_0
+lz4-c=1.9.4=h6a678d5_0
+lzo=2.10=h7b6447c_2
+markdown=3.4.1=py38h06a4308_0
+markupsafe=2.1.1=py38h7f8727e_0
+matplotlib-base=3.7.2=py38h1128e8f_0
+matplotlib-inline=0.1.6=py38h06a4308_0
+mdanalysis=2.4.3=pypi_0
+mdtraj=1.9.9=py38h028faf2_0
+meeko=0.1.dev3=pypi_0
+mistune=0.8.4=py38h7b6447c_1000
+mkl=2021.4.0=h06a4308_640
+mkl-service=2.4.0=py38h7f8727e_0
+mkl_fft=1.3.1=py38hd3c417c_0
+mkl_random=1.2.2=py38h51133e4_0
+mmcif-pdbx=2.0.1=pypi_0
+mmpbsa-py=16.0=pypi_0
+mmtf-python=1.1.3=pypi_0
+mrcfile=1.4.3=pypi_0
+mscorefonts=0.0.1=3
+msgpack=1.0.6=pypi_0
+multidict=6.0.2=py38h5eee18b_0
+multiprocess=0.70.14=py38h06a4308_0
+munkres=1.1.4=py_0
+mygene=3.2.2=pypi_0
+nb_conda_kernels=2.3.1=py38h06a4308_0
+nbclassic=0.5.5=py38h06a4308_0
+nbclient=0.5.13=py38h06a4308_0
+nbconvert=6.5.4=py38h06a4308_0
+nbformat=5.9.2=py38h06a4308_0
+ncurses=6.4=h6a678d5_0
+nest-asyncio=1.5.6=py38h06a4308_0
+netcdf-fortran=4.5.4=nompi_h2b6e579_100
+nettle=3.7.3=hbbd107a_1
+networkx=3.1=pyhd8ed1ab_0
+notebook=6.5.4=py38h06a4308_1
+notebook-shim=0.2.2=py38h06a4308_0
+numexpr=2.8.4=py38he184ba9_0
+numpy=1.24.3=py38h14f4228_0
+numpy-base=1.24.3=py38h31eccc5_0
+oauthlib=3.2.2=py38h06a4308_0
+ocl-icd=2.3.1=h7f98852_0
+ocl-icd-system=1.0.0=1
+ogb=1.3.6=pypi_0
+omegaconf=2.3.0=pypi_0
+openbabel=3.1.1=py38hd2c4bc0_3
+openff-forcefields=2023.08.0=pyh1a96a4e_0
+openff-toolkit=0.10.7=pyhd8ed1ab_0
+openff-toolkit-base=0.10.7=pyhd8ed1ab_0
+openh264=2.1.1=h4ff587b_0
+openmm=8.0.0=py38hd11a18e_1
+openssl=1.1.1w=h7f8727e_0
+opt-einsum=3.3.0=pypi_0
+orc=1.7.4=hb3bc3d3_1
+orjson=3.9.9=pypi_0
+outdated=0.2.2=pypi_0
+packaging=23.1=py38h06a4308_0
+packmol=20.010=h86c2bf4_0
+packmol-memgen=1.2.3rc0=pypi_0
+pandas=2.0.3=py38h1128e8f_0
+pandocfilters=1.5.0=pyhd3eb1b0_0
+parmed=3.4.4=py38h8dc9893_0
+parso=0.8.3=pyhd3eb1b0_0
+pcre=8.45=h295c915_0
+pdb2pqr=3.6.1=pypi_0
+pdb4amber=22.0=pypi_0
+pdbfixer=1.9=pyh1a96a4e_0
+perl=5.32.1=4_hd590300_perl5
+pexpect=4.8.0=pyhd3eb1b0_3
+pickleshare=0.7.5=pyhd3eb1b0_1003
+pillow=9.4.0=py38h6a678d5_0
+pip=23.2.1=py38h06a4308_0
+pixman=0.40.0=h7f8727e_1
+pkgutil-resolve-name=1.3.10=py38h06a4308_0
+platformdirs=3.10.0=py38h06a4308_0
+pooch=1.4.0=pyhd3eb1b0_0
+posebusters=0.2.6=pypi_0
+posecheck=0.1=dev_0
+prolif=2.0.0.post1=pypi_0
+prometheus_client=0.14.1=py38h06a4308_0
+prompt-toolkit=3.0.36=py38h06a4308_0
+propka=3.5.0=pypi_0
+protobuf=3.20.3=py38h6a678d5_0
+psutil=5.9.0=py38h5eee18b_0
+ptyprocess=0.7.0=pyhd3eb1b0_2
+pure_eval=0.2.2=pyhd3eb1b0_0
+py-cpuinfo=8.0.0=pyhd3eb1b0_1
+py3dmol=2.0.4=pypi_0
+pyarrow=11.0.0=py38h468efa6_1
+pyasn1=0.4.8=pyhd3eb1b0_0
+pyasn1-modules=0.2.8=py_0
+pycairo=1.23.0=py38hd1222b9_0
+pycparser=2.21=pyhd3eb1b0_0
+pydantic=2.4.2=pypi_0
+pydantic-core=2.10.1=pypi_0
+pydub=0.25.1=pypi_0
+pyg-lib=0.2.0+pt113cu117=pypi_0
+pygments=2.15.1=py38h06a4308_1
+pygsp=0.5.1=pypi_0
+pyjwt=2.4.0=py38h06a4308_0
+pyopenssl=23.2.0=py38h06a4308_0
+pyparsing=3.0.9=py38h06a4308_0
+pyrsistent=0.18.0=py38heee7806_0
+pysocks=1.7.1=py38h06a4308_0
+pytables=3.8.0=py38hb8ae3fc_3
+python=3.8.11=h12debd9_0_cpython
+python-constraint=1.4.0=py_0
+python-dateutil=2.8.2=pyhd3eb1b0_0
+python-fastjsonschema=2.16.2=py38h06a4308_0
+python-lmdb=1.4.1=py38h6a678d5_0
+python-multipart=0.0.6=pypi_0
+python-tzdata=2023.3=pyhd3eb1b0_0
+python-xxhash=2.0.2=py38h5eee18b_1
+python_abi=3.8=3_cp38
+pytorch=1.13.0=py3.8_cuda11.7_cudnn8.5.0_0
+pytorch-cuda=11.7=h778d358_5
+pytorch-mutex=1.0=cuda
+pytraj=2.0.6=pypi_0
+pytz=2022.7=py38h06a4308_0
+pyyaml=6.0=py38h5eee18b_1
+pyzmq=23.2.0=py38h6a678d5_0
+qvina=2.1.0=h62396cd_2
+rdkit=2023.3.3=pypi_0
+re2=2022.04.01=h295c915_0
+readline=8.2=h5eee18b_0
+reduce=3.24=0
+regex=2022.7.9=py38h5eee18b_0
+reportlab=3.5.67=py38hfdd840d_1
+requests=2.31.0=py38h06a4308_0
+requests-oauthlib=1.3.0=py_0
+responses=0.13.3=pyhd3eb1b0_0
+rsa=4.7.2=pyhd3eb1b0_1
+sacremoses=0.0.43=pyhd3eb1b0_0
+safetensors=0.3.2=py38hb02cf49_0
+sander=22.0=pypi_0
+scikit-learn=1.3.0=py38h1128e8f_0
+scipy=1.10.1=py38h14f4228_0
+seaborn=0.12.2=pypi_0
+selfies=2.1.1=pypi_0
+semantic-version=2.10.0=pypi_0
+send2trash=1.8.0=pyhd3eb1b0_1
+setuptools=68.0.0=py38h06a4308_0
+six=1.16.0=pyhd3eb1b0_1
+smirnoff99frosst=1.1.0=pyh44b312d_0
+snappy=1.1.9=h295c915_0
+sniffio=1.2.0=py38h06a4308_1
+sortedcontainers=2.4.0=pypi_0
+soupsieve=2.4=py38h06a4308_0
+sqlalchemy=1.4.39=py38h5eee18b_0
+sqlite=3.41.2=h5eee18b_0
+stack_data=0.2.0=pyhd3eb1b0_0
+starlette=0.27.0=pypi_0
+tensorboard=2.12.1=py38h06a4308_0
+tensorboard-data-server=0.7.0=py38h52d8a92_0
+tensorboard-plugin-wit=1.8.1=py38h06a4308_0
+terminado=0.17.1=py38h06a4308_0
+threadpoolctl=2.2.0=pyh0d69192_0
+tinycss2=1.2.1=py38h06a4308_0
+tk=8.6.12=h1ccaba5_0
+tokenizers=0.13.2=py38he7d60b5_1
+toolz=0.12.0=pypi_0
+torch-cluster=1.6.1+pt113cu117=pypi_0
+torch-geometric=2.3.1=pypi_0
+torch-scatter=2.1.1+pt113cu117=pypi_0
+torch-sparse=0.6.17+pt113cu117=pypi_0
+torch-spline-conv=1.2.2+pt113cu117=pypi_0
+tornado=6.3.2=py38h5eee18b_0
+tqdm=4.65.0=py38hb070fc8_0
+traitlets=5.7.1=py38h06a4308_0
+transformers=4.31.0=pyhd8ed1ab_0
+typing-extensions=4.8.0=pypi_0
+urllib3=1.26.16=py38h06a4308_0
+utf8proc=2.6.1=h27cfd23_0
+uvicorn=0.23.2=pypi_0
+vina=1.2.2=pypi_0
+wcwidth=0.2.5=pyhd3eb1b0_0
+webencodings=0.5.1=py38_1
+websocket-client=0.58.0=py38h06a4308_4
+websockets=11.0.3=pypi_0
+werkzeug=2.2.3=py38h06a4308_0
+wheel=0.38.4=py38h06a4308_0
+x264=1!157.20191217=h7b6447c_0
+xmltodict=0.13.0=pyhd8ed1ab_0
+xorg-kbproto=1.0.7=h7f98852_1002
+xorg-libice=1.1.1=hd590300_0
+xorg-libsm=1.2.4=h7391055_0
+xorg-libx11=1.8.6=h8ee46fc_0
+xorg-libxext=1.3.4=h0b41bf4_2
+xorg-libxt=1.3.0=hd590300_1
+xorg-xextproto=7.3.0=h0b41bf4_1003
+xorg-xproto=7.0.31=h7f98852_1007
+xxhash=0.8.0=h7f8727e_3
+xz=5.2.10=h5eee18b_1
+yaml=0.2.5=h7b6447c_0
+yarl=1.8.1=py38h5eee18b_0
+zeromq=4.3.4=h2531618_0
+zipp=3.11.0=py38h06a4308_0
+zlib=1.2.13=hd590300_5
+zlib-ng=2.0.7=h5eee18b_0
+zstd=1.5.5=hc292b87_0

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .dihedral_utils import batch_dihedrals, rotation_matrix_v2, von_Mises_loss
+from .chemutils import get_clique_mol, tree_decomp, get_mol, get_smiles, set_atommap, get_clique_mol_simple, assemble, mol_to_graph_data_obj_simple
+from .dihedral_utils import rotation_matrix_v2, von_Mises_loss, batch_dihedrals

utils/chem.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import copy
+import torch
+from io import BytesIO
+from openbabel import openbabel
+from torch_geometric.utils import to_networkx
+from torch_geometric.data import Data
+from torch_scatter import scatter
+from rdkit import Chem
+from rdkit.Chem.rdchem import Mol, HybridizationType, BondType
+from rdkit.Chem.rdchem import BondType as BT
+# Lookup tables derived from RDKit's BondType enumeration: BOND_TYPES maps each
+# BondType value to its position in BT.names, and BOND_NAMES maps that position
+# to the corresponding bond-type name.
+BOND_TYPES = {t: i for i, t in enumerate(BT.names.values())}
+BOND_NAMES = {i: t for i, t in enumerate(BT.names.keys())}
+def rdmol_to_data(mol, smiles=None):
+    """Convert a single-conformer RDKit molecule into a torch_geometric ``Data`` object.
+    Args:
+        mol: RDKit molecule holding exactly one conformer.
+        smiles: Optional SMILES string to store; when ``None`` it is derived
+            from ``mol`` with hydrogens removed.
+    Returns:
+        ``Data`` with ``atom_type`` (atomic numbers), ``pos`` (conformer
+        coordinates), ``edge_index``, ``edge_type``, a deep copy of the
+        molecule in ``rdmol``, ``smiles`` and an undirected networkx graph
+        in ``nx``.
+    """
+    assert mol.GetNumConformers() == 1
+    N = mol.GetNumAtoms()
+    pos = torch.tensor(mol.GetConformer(0).GetPositions(), dtype=torch.float32)
+    z = torch.tensor([atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=torch.long)
+    # Every bond is stored in both directions so the edge list is symmetric.
+    row, col, edge_type = [], [], []
+    for bond in mol.GetBonds():
+        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+        row += [start, end]
+        col += [end, start]
+        edge_type += 2 * [BOND_TYPES[bond.GetBondType()]]
+    edge_index = torch.tensor([row, col], dtype=torch.long)
+    # Explicit dtype keeps the labels integral even when the molecule has no bonds.
+    edge_type = torch.tensor(edge_type, dtype=torch.long)
+    # Sort edges by (source, target) index.
+    perm = (edge_index[0] * N + edge_index[1]).argsort()
+    edge_index = edge_index[:, perm]
+    edge_type = edge_type[perm]
+    if smiles is None:
+        smiles = Chem.MolToSmiles(Chem.RemoveHs(mol))
+    data = Data(atom_type=z, pos=pos, edge_index=edge_index, edge_type=edge_type,
+                rdmol=copy.deepcopy(mol), smiles=smiles)
+    data.nx = to_networkx(data, to_undirected=True)
+    return data
+def generated_to_xyz(data):
+    """Render the ligand context atoms of ``data`` as XYZ-format text."""
+    periodic_table = Chem.GetPeriodicTable()
+    atom_count = data.ligand_context_element.size(0)
+    # Header: atom count followed by an empty comment line.
+    chunks = ["%d\n\n" % (atom_count, )]
+    for idx in range(atom_count):
+        symbol = periodic_table.GetElementSymbol(data.ligand_context_element[idx].item())
+        x, y, z = data.ligand_context_pos[idx].clone().cpu().tolist()
+        chunks.append("%s %.8f %.8f %.8f\n" % (symbol, x, y, z))
+    return "".join(chunks)
+def generated_to_sdf(data):
+    """Convert the generated ligand atoms to SDF text by passing their XYZ text through Open Babel."""
+    xyz_text = generated_to_xyz(data)
+    converter = openbabel.OBConversion()
+    converter.SetInAndOutFormats("xyz", "sdf")
+    ob_mol = openbabel.OBMol()
+    converter.ReadString(ob_mol, xyz_text)
+    return converter.WriteString(ob_mol)
+def sdf_to_rdmol(sdf):
+    """Return the first molecule the SDF text yields, or ``None`` when it yields none."""
+    supplier = Chem.ForwardSDMolSupplier(BytesIO(sdf.encode()))
+    return next(iter(supplier), None)
+def generated_to_rdmol(data):
+    """Build an RDKit molecule from generated ligand atoms via an intermediate SDF string."""
+    return sdf_to_rdmol(generated_to_sdf(data))
+def filter_rd_mol(rdmol):
+    """Return ``False`` when two three-membered rings share at least one atom, else ``True``."""
+    triangles = [set(ring) for ring in rdmol.GetRingInfo().AtomRings() if len(ring) == 3]
+    # 3-3 ring intersection: compare every triangle with the ones listed before it
+    for idx, ring in enumerate(triangles):
+        for earlier in triangles[:idx]:
+            if ring & earlier:
+                return False
+    return True

utils/chemutils.py ADDED Viewed

	@@ -0,0 +1,597 @@

+import rdkit
+import rdkit.Chem as Chem
+from scipy.sparse import csr_matrix
+from scipy.sparse.csgraph import minimum_spanning_tree
+from collections import defaultdict
+from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
+from rdkit.Chem.Descriptors import MolLogP, qed
+from torch_geometric.data import Data, Batch
+from random import sample
+from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule
+import numpy as np
+from math import sqrt
+import torch
+from copy import deepcopy
+# Weight offset used by tree_decomp: a clique-graph edge whose two clusters share
+# k atoms gets weight MST_MAX_WEIGHT - k before the spanning tree is computed.
+MST_MAX_WEIGHT = 100
+# NOTE(review): not referenced in the visible part of this file; presumably a cap
+# on the number of candidates - confirm against the rest of the module.
+MAX_NCAND = 2000
+def vina_score(mol):
+    """Add explicit hydrogens (with coordinates) to ``mol`` and optionally UFF-optimize the copy.
+    NOTE(review): ``use_uff`` is neither a parameter nor assigned in this function,
+    so unless a module-level ``use_uff`` is defined elsewhere the call raises
+    ``NameError``. Nothing is returned and no score is computed here - this looks
+    unfinished; confirm the intended behavior.
+    """
+    ligand_rdmol = Chem.AddHs(mol, addCoords=True)
+    if use_uff:
+        UFFOptimizeMolecule(ligand_rdmol)
+def lipinski(mol):
+    """Return ``True`` when ``mol`` satisfies every drug-likeness threshold checked here.
+    The thresholds are: logP <= 5, H-bond donors <= 5, H-bond acceptors <= 10,
+    exact molecular weight <= 500 and rotatable bonds <= 5.
+    """
+    # The Lipinski criterion is logP, not QED: QED lies in [0, 1], so the
+    # previous `qed(mol) <= 5` test could never fail.
+    return bool(
+        MolLogP(mol) <= 5
+        and Chem.Lipinski.NumHDonors(mol) <= 5
+        and Chem.Lipinski.NumHAcceptors(mol) <= 10
+        and Chem.Descriptors.ExactMolWt(mol) <= 500
+        and Chem.Lipinski.NumRotatableBonds(mol) <= 5
+    )
+def list_filter(a,b):
+    """Return the items of ``a`` that also occur in ``b``, keeping ``a``'s order and duplicates."""
+    return [item for item in a if item in b]
+def rand_rotate(dir, ref, pos, alpha=None, device=None):
+    """Rotate the points ``pos`` by angle ``alpha`` about the axis along ``dir`` through ``ref``.
+    Args:
+        dir: Axis direction; it is normalised to unit length here.
+        ref: A point on the rotation axis.
+        pos: Points to rotate, one row of three coordinates per point.
+        alpha: Rotation angle passed to ``torch.sin``/``torch.cos``; drawn from
+            ``torch.randn(1)`` when ``None``.
+        device: Device for the tensors created here; defaults to ``'cpu'``.
+    Returns:
+        The rotated points, one row per input point.
+    """
+    if device is None:
+        device = 'cpu'
+    dir = dir/torch.norm(dir)
+    if alpha is None:
+        alpha = torch.randn(1).to(device)
+    n_pos = pos.shape[0]
+    sin, cos = torch.sin(alpha).to(device), torch.cos(alpha).to(device)
+    K = 1 - cos
+    M = torch.dot(dir, ref)
+    nx, ny, nz = dir[0], dir[1], dir[2]
+    x0, y0, z0 = ref[0], ref[1], ref[2]
+    # 4x4 homogeneous transform: the upper-left 3x3 block is the axis-angle
+    # rotation, the fourth column is the translation that keeps ``ref`` fixed.
+    T = torch.tensor([nx ** 2 * K + cos, nx * ny * K - nz * sin, nx * nz * K + ny * sin,
+         (x0 - nx * M) * K + (nz * y0 - ny * z0) * sin,
+         nx * ny * K + nz * sin, ny ** 2 * K + cos, ny * nz * K - nx * sin,
+         (y0 - ny * M) * K + (nx * z0 - nz * x0) * sin,
+         nx * nz * K - ny * sin, ny * nz * K + nx * sin, nz ** 2 * K + cos,
+         (z0 - nz * M) * K + (ny * x0 - nx * y0) * sin,
+         0, 0, 0, 1], device=device).reshape(4, 4)
+    # Append a row of ones so the points become homogeneous column vectors.
+    pos = torch.cat([pos.t(), torch.ones(n_pos, device=device).unsqueeze(0)], dim=0)
+    rotated_pos = torch.mm(T, pos)[:3]
+    return rotated_pos.t()
+def kabsch(A, B):
+    """Find the rigid transform that best maps the points ``B`` onto ``A`` (Kabsch algorithm).
+    Args:
+        A: Nominal points, an N x 3 array.
+        B: Measured points, an N x 3 array whose rows pair with those of ``A``.
+    Returns:
+        ``(R, t)`` where ``R`` is the 3 x 3 rotation matrix (B to A) and ``t``
+        the translation (B to A), so that ``R @ b + t`` approximates the
+        matching row ``a``.
+    """
+    assert len(A) == len(B)
+    centroid_A = np.mean(A, axis=0)
+    centroid_B = np.mean(B, axis=0)
+    # center the points (broadcasting subtracts the centroid from every row)
+    AA = A - centroid_A
+    BB = B - centroid_B
+    # `@` is a true matrix product for ndarrays, whereas `*` multiplies
+    # element-wise and only behaved as intended for np.matrix inputs.
+    H = BB.T @ AA
+    U, S, Vt = np.linalg.svd(H)
+    R = Vt.T @ U.T
+    # special reflection case
+    if np.linalg.det(R) < 0:
+        Vt[2, :] *= -1
+        R = Vt.T @ U.T
+    t = -R @ centroid_B.T + centroid_A.T
+    return R, t
+def kabsch_torch(A, B, C):
+    """Apply to ``C`` the rigid transform that superimposes ``A`` onto ``B``.
+    All inputs are cast to double precision. Returns ``(C_aligned, R, t)`` with
+    ``C_aligned = (R @ C^T)^T + t``.
+    NOTE(review): there is no determinant check, so ``R`` may be an improper
+    rotation (a reflection) for some inputs.
+    """
+    src, dst, pts = A.double(), B.double(), C.double()
+    src_mean = src.mean(dim=0, keepdims=True)
+    dst_mean = dst.mean(dim=0, keepdims=True)
+    # Cross-covariance of the centred point sets
+    cov = torch.matmul((src - src_mean).transpose(0,1), dst - dst_mean)
+    U, _, V = torch.svd(cov)
+    # Rotation matrix
+    R = torch.matmul(V, U.transpose(0,1))
+    # Translation vector
+    t = dst_mean - torch.matmul(R, src_mean.transpose(0,1)).transpose(0,1)
+    C_aligned = torch.matmul(R, pts.transpose(0,1)).transpose(0,1) + t
+    return C_aligned, R, t
+def eig_coord_from_dist(D):
+    """Recover 3D coordinates from a matrix of squared pairwise distances.
+    Args:
+        D: Squared-distance matrix; indexed here as a 2-D ``[N, N]`` tensor.
+    Returns:
+        Detached ``[N, 3]`` tensor (fewer columns when ``N < 3``) whose pairwise
+        squared distances reproduce ``D`` when the points are embeddable in 3D.
+    """
+    # Gram matrix of the points expressed relative to point 0
+    M = (D[:1, :] + D[:, :1] - D) / 2
+    L, V = torch.linalg.eigh(M)
+    # Reorder eigenvalues AND eigenvectors together, largest eigenvalue first;
+    # sorting only the eigenvalues would pair each vector with the wrong value.
+    L, order = torch.sort(L, descending=True)
+    V = V[:, order]
+    L = torch.diag_embed(L)
+    # Negative eigenvalues are clamped to zero before the square root
+    X = torch.matmul(V, L.clamp(min=0).sqrt())
+    return X[:, :3].detach()
+def self_square_dist(X):
+    """Return the matrix of squared Euclidean distances between all rows of ``X``."""
+    # Broadcasting [1, N, d] against [N, 1, d] yields every pairwise difference
+    pairwise_diff = X[None, :] - X[:, None]
+    return pairwise_diff.pow(2).sum(dim=-1)
+def set_atommap(mol, num=0):
+    """Assign atom-map number ``num`` to every atom of ``mol`` in place."""
+    for current_atom in mol.GetAtoms():
+        current_atom.SetAtomMapNum(num)
+def get_mol(smiles):
+    """Parse ``smiles`` and return the kekulized molecule, or ``None`` when parsing fails."""
+    parsed = Chem.MolFromSmiles(smiles)
+    if parsed is not None:
+        Chem.Kekulize(parsed)
+    return parsed
+def get_smiles(mol):
+    """Return the SMILES string of ``mol`` written without kekulization (``kekuleSmiles=False``)."""
+    return Chem.MolToSmiles(mol, kekuleSmiles=False)
+def decode_stereo(smiles2D):
+    """Enumerate stereoisomers of ``smiles2D`` and return their isomeric SMILES strings.
+    When the first enumerated isomer has chiral nitrogen atoms, an extra SMILES
+    with those nitrogen chiral tags cleared is appended for every isomer.
+    """
+    mol = Chem.MolFromSmiles(smiles2D)
+    dec_isomers = list(EnumerateStereoisomers(mol))
+    # Round-trip each isomer through isomeric SMILES to get freshly parsed molecules.
+    dec_isomers = [Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=True)) for mol in dec_isomers]
+    smiles3D = [Chem.MolToSmiles(mol, isomericSmiles=True) for mol in dec_isomers]
+    # Indices of nitrogen atoms carrying a chiral tag, taken from the first isomer only.
+    chiralN = [atom.GetIdx() for atom in dec_isomers[0].GetAtoms() if
+               int(atom.GetChiralTag()) > 0 and atom.GetSymbol() == "N"]
+    if len(chiralN) > 0:
+        for mol in dec_isomers:
+            for idx in chiralN:
+                mol.GetAtomWithIdx(idx).SetChiralTag(Chem.rdchem.ChiralType.CHI_UNSPECIFIED)
+            smiles3D.append(Chem.MolToSmiles(mol, isomericSmiles=True))
+    return smiles3D
+def sanitize(mol):
+    """Round-trip ``mol`` through SMILES and back; return ``None`` if any step raises."""
+    try:
+        return get_mol(get_smiles(mol))
+    except Exception:
+        return None
+def copy_atom(atom):
+    """Create a new atom carrying only the symbol, formal charge and atom-map number of ``atom``."""
+    new_atom = Chem.Atom(atom.GetSymbol())
+    new_atom.SetFormalCharge(atom.GetFormalCharge())
+    new_atom.SetAtomMapNum(atom.GetAtomMapNum())
+    return new_atom
+def copy_edit_mol(mol):
+    """Return an editable ``RWMol`` rebuilt from ``mol``'s atoms (via ``copy_atom``) and bonds."""
+    editable = Chem.RWMol(Chem.MolFromSmiles(''))
+    for src_atom in mol.GetAtoms():
+        editable.AddAtom(copy_atom(src_atom))
+    for src_bond in mol.GetBonds():
+        begin_idx = src_bond.GetBeginAtom().GetIdx()
+        end_idx = src_bond.GetEndAtom().GetIdx()
+        editable.AddBond(begin_idx, end_idx, src_bond.GetBondType())
+    return editable
+def get_submol(mol, idxs, mark=()):
+    """Copy the atoms of ``mol`` listed in ``idxs``, and the bonds between them, into a new molecule.
+    Args:
+        mol: Source molecule.
+        idxs: Indices of the atoms to keep.
+        mark: Indices of kept atoms that receive atom-map number 1; every
+            other kept atom receives 0.
+    Returns:
+        The sub-molecule as returned by ``RWMol.GetMol()``.
+    """
+    # Sets give constant-time membership tests inside the atom and bond loops.
+    keep = set(idxs)
+    marked = set(mark)
+    new_mol = Chem.RWMol(Chem.MolFromSmiles(''))
+    old_to_new = {}
+    for atom in mol.GetAtoms():
+        old_idx = atom.GetIdx()
+        if old_idx not in keep:
+            continue
+        new_atom = copy_atom(atom)
+        new_atom.SetAtomMapNum(1 if old_idx in marked else 0)
+        old_to_new[old_idx] = new_mol.AddAtom(new_atom)
+    for bond in mol.GetBonds():
+        a1 = bond.GetBeginAtom().GetIdx()
+        a2 = bond.GetEndAtom().GetIdx()
+        if a1 in keep and a2 in keep:
+            new_mol.AddBond(old_to_new[a1], old_to_new[a2], bond.GetBondType())
+    return new_mol.GetMol()
+def get_clique_mol(mol, atoms):
+    """Extract the fragment of ``mol`` spanned by ``atoms`` and return it sanitized."""
+    fragment_smiles = Chem.MolFragmentToSmiles(mol, atoms, kekuleSmiles=True)
+    fragment = Chem.MolFromSmiles(fragment_smiles, sanitize=False)
+    rebuilt = copy_edit_mol(fragment).GetMol()
+    # We assume the sanitized result is not None
+    return sanitize(rebuilt)
+def get_clique_mol_simple(mol, cluster):
+    """Return the unsanitized molecule parsed from the canonical kekulized SMILES of ``cluster``."""
+    return Chem.MolFromSmiles(
+        Chem.MolFragmentToSmiles(mol, cluster, canonical=True, kekuleSmiles=True),
+        sanitize=False,
+    )
+def tree_decomp(mol, reference_vocab=None):
+    """Decompose ``mol`` into clusters (non-ring bonds and rings) joined by a spanning tree.
+    Args:
+        mol: Molecule to decompose.
+        reference_vocab: Optional mapping indexed by the SMILES of a merged
+            ring pair; its value is compared against 100 to decide whether two
+            rings sharing exactly two atoms are merged.
+            NOTE(review): it is indexed with ``[]``, so a plain dict raises
+            ``KeyError`` for unseen SMILES - presumably a counter/defaultdict; confirm.
+    Returns:
+        ``(clusters, edges)``: ``clusters`` is a list of sets of atom indices and
+        ``edges`` a list of ``(cluster_i, cluster_j)`` index pairs; ``edges`` is
+        empty when no two clusters share an atom.
+    """
+    edges = defaultdict(int)
+    n_atoms = mol.GetNumAtoms()
+    clusters = []
+    # Every bond that is not part of a ring becomes its own two-atom cluster.
+    for bond in mol.GetBonds():
+        a1 = bond.GetBeginAtom().GetIdx()
+        a2 = bond.GetEndAtom().GetIdx()
+        if not bond.IsInRing():
+            clusters.append({a1, a2})
+    # Rings from the symmetrized smallest set of smallest rings
+    ssr = [set(x) for x in Chem.GetSymmSSSR(mol)]
+    # remove too large circles (more than 8 atoms)
+    ssr = [x for x in ssr if len(x) <= 8]
+    # Merge rings that overlap. With a reference_vocab, rings sharing >= 2 atoms
+    # are merged unless they share exactly 2 atoms and the merged fragment's
+    # vocabulary value is <= 100; without one, only rings sharing > 2 atoms merge.
+    for i in range(len(ssr)-1):
+        if len(ssr[i]) <= 2:
+            continue
+        for j in range(i+1, len(ssr)):
+            if len(ssr[j]) <= 2:
+                continue
+            inter = ssr[i] & ssr[j]
+            if reference_vocab is not None:
+                if len(inter) >= 2:
+                    merge = ssr[i] | ssr[j]
+                    smile_merge = Chem.MolFragmentToSmiles(mol, merge, canonical=True, kekuleSmiles=True)
+                    if reference_vocab[smile_merge] <= 100 and len(inter) == 2:
+                        continue
+                    ssr[i] = merge
+                    ssr[j] = set()
+            else:
+                if len(inter) > 2:
+                    merge = ssr[i] | ssr[j]
+                    ssr[i] = merge
+                    ssr[j] = set()
+    # Drop the rings that were emptied by a merge.
+    ssr = [c for c in ssr if len(c) > 0]
+    clusters.extend(ssr)
+    # nei_list[a] lists the indices of the clusters containing atom a.
+    nei_list = [[] for _ in range(n_atoms)]
+    for i in range(len(clusters)):
+        for atom in clusters[i]:
+            nei_list[atom].append(i)
+    # Build edges: connect every pair of clusters sharing an atom, remembering
+    # the size of their overlap.
+    for atom in range(n_atoms):
+        if len(nei_list[atom]) <= 1:
+            continue
+        cnei = nei_list[atom]
+        for i in range(len(cnei)):
+            for j in range(i + 1, len(cnei)):
+                c1, c2 = cnei[i], cnei[j]
+                inter = set(clusters[c1]) & set(clusters[c2])
+                if edges[(c1, c2)] < len(inter):
+                    edges[(c1, c2)] = len(inter)  # cnei[i] < cnei[j] by construction
+    # Turn overlap sizes into weights so that larger overlaps get smaller weights.
+    edges = [u + (MST_MAX_WEIGHT - v,) for u, v in edges.items()]
+    if len(edges) == 0:
+        return clusters, edges
+    # A minimum spanning tree over (MST_MAX_WEIGHT - overlap) weights is a
+    # maximum spanning tree with respect to the overlap sizes.
+    row, col, data = zip(*edges)
+    n_clique = len(clusters)
+    clique_graph = csr_matrix((data, (row, col)), shape=(n_clique, n_clique))
+    junc_tree = minimum_spanning_tree(clique_graph)
+    row, col = junc_tree.nonzero()
+    edges = [(row[i], col[i]) for i in range(len(row))]
+    return clusters, edges
+def atom_equal(a1, a2):
+    """Return ``True`` when both atoms have the same element symbol and formal charge."""
+    if a1.GetSymbol() != a2.GetSymbol():
+        return False
+    return a1.GetFormalCharge() == a2.GetFormalCharge()
+# Two bonds match when their end atoms match pairwise (see atom_equal) AND their
+# bond types are equal. NOTE(review): an earlier comment here said bond type is
+# not considered, but the return expression does compare GetBondType().
+def ring_bond_equal(bond1, bond2, reverse=False):
+    """Compare two bonds by end atoms and bond type; ``reverse`` swaps ``bond2``'s end atoms first."""
+    b1 = (bond1.GetBeginAtom(), bond1.GetEndAtom())
+    if reverse:
+        b2 = (bond2.GetEndAtom(), bond2.GetBeginAtom())
+    else:
+        b2 = (bond2.GetBeginAtom(), bond2.GetEndAtom())
+    return atom_equal(b1[0], b2[0]) and atom_equal(b1[1], b2[1]) and bond1.GetBondType() == bond2.GetBondType()
+def attach(ctr_mol, nei_mol, amap):
+    """Graft ``nei_mol`` onto a copy of ``ctr_mol`` following the atom mapping ``amap``.
+    ``amap`` maps neighbour atom indices to centre atom indices and is extended
+    in place with an entry for every neighbour atom that has to be added; those
+    new atoms get atom-map number 2. Returns ``(merged_mol, amap)``.
+    """
+    merged = Chem.RWMol(ctr_mol)
+    for nei_atom in nei_mol.GetAtoms():
+        nei_idx = nei_atom.GetIdx()
+        if nei_idx in amap:
+            continue
+        added = copy_atom(nei_atom)
+        added.SetAtomMapNum(2)
+        amap[nei_idx] = merged.AddAtom(added)
+    for nei_bond in nei_mol.GetBonds():
+        begin = amap[nei_bond.GetBeginAtom().GetIdx()]
+        end = amap[nei_bond.GetEndAtom().GetIdx()]
+        # Only add bonds that the merged molecule does not have yet
+        if merged.GetBondBetweenAtoms(begin, end) is None:
+            merged.AddBond(begin, end, nei_bond.GetBondType())
+    return merged.GetMol(), amap
+def attach_mols(ctr_mol, neighbors, prev_nodes, nei_amap):
+    """Attach every node in ``prev_nodes + neighbors`` to the editable molecule ``ctr_mol``.
+    Args:
+        ctr_mol: Editable molecule that is modified in place and returned.
+        neighbors: Nodes (objects with ``nid`` and ``mol``) to attach.
+        prev_nodes: Previously attached nodes; their bonds override existing ones.
+        nei_amap: Per-node dict mapping neighbour atom index to ``ctr_mol`` atom
+            index; extended in place for atoms added here.
+    Returns:
+        ``ctr_mol`` after all attachments.
+    """
+    prev_nids = [node.nid for node in prev_nodes]
+    for nei_node in prev_nodes + neighbors:
+        nei_id, nei_mol = nei_node.nid, nei_node.mol
+        amap = nei_amap[nei_id]
+        # Add the neighbour atoms that are not mapped onto an existing atom yet.
+        for atom in nei_mol.GetAtoms():
+            if atom.GetIdx() not in amap:
+                new_atom = copy_atom(atom)
+                amap[atom.GetIdx()] = ctr_mol.AddAtom(new_atom)
+        if nei_mol.GetNumBonds() == 0:
+            # Bond-less neighbour: only carry over the atom-map number of its atom 0.
+            nei_atom = nei_mol.GetAtomWithIdx(0)
+            ctr_atom = ctr_mol.GetAtomWithIdx(amap[0])
+            ctr_atom.SetAtomMapNum(nei_atom.GetAtomMapNum())
+        else:
+            for bond in nei_mol.GetBonds():
+                a1 = amap[bond.GetBeginAtom().GetIdx()]
+                a2 = amap[bond.GetEndAtom().GetIdx()]
+                if ctr_mol.GetBondBetweenAtoms(a1, a2) is None:
+                    ctr_mol.AddBond(a1, a2, bond.GetBondType())
+                elif nei_id in prev_nids:  # father node overrides
+                    ctr_mol.RemoveBond(a1, a2)
+                    ctr_mol.AddBond(a1, a2, bond.GetBondType())
+    return ctr_mol
+def local_attach(ctr_mol, neighbors, prev_nodes, amap_list):
+    """Attach the given nodes to a copy of ``ctr_mol`` using the triples in ``amap_list``.
+    Each triple is ``(node_id, ctr_atom_idx, nei_atom_idx)``. Returns the
+    assembled molecule as produced by ``GetMol()``.
+    """
+    editable = copy_edit_mol(ctr_mol)
+    nei_amap = {}
+    for node in prev_nodes + neighbors:
+        nei_amap[node.nid] = {}
+    for nei_id, ctr_atom, nei_atom in amap_list:
+        nei_amap[nei_id][nei_atom] = ctr_atom
+    return attach_mols(editable, neighbors, prev_nodes, nei_amap).GetMol()
+# This version records idx mapping between ctr_mol and nei_mol
+def enum_attach(ctr_mol, nei_mol):
+    """Enumerate the ways ``nei_mol`` can be attached to the marked part of ``ctr_mol``.
+    Only atoms of ``ctr_mol`` with atom-map number 1 (and bonds between two
+    such atoms) are considered as attachment sites.
+    Args:
+        ctr_mol: Centre molecule; kekulized in place.
+        nei_mol: Neighbour fragment to attach; kekulized in place.
+    Returns:
+        List of dicts mapping neighbour atom index to centre atom index, one
+        per candidate attachment; empty when either molecule cannot be kekulized.
+    """
+    try:
+        Chem.Kekulize(ctr_mol)
+        Chem.Kekulize(nei_mol)
+    except Exception:
+        # Kekulization failure means no attachment can be enumerated;
+        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
+        return []
+    att_confs = []
+    ctr_bonds = [bond for bond in ctr_mol.GetBonds() if bond.GetBeginAtom().GetAtomMapNum() == 1 and bond.GetEndAtom().GetAtomMapNum() == 1]
+    ctr_atoms = [atom for atom in ctr_mol.GetAtoms() if atom.GetAtomMapNum() == 1]
+    if nei_mol.GetNumBonds() == 1:  # neighbor is a bond
+        bond = nei_mol.GetBondWithIdx(0)
+        bond_val = int(bond.GetBondTypeAsDouble())
+        b1, b2 = bond.GetBeginAtom(), bond.GetEndAtom()
+        for atom in ctr_atoms:
+            # Optimize if atom is carbon (other atoms may change valence)
+            if atom.GetAtomicNum() == 6 and atom.GetTotalNumHs() < bond_val:
+                continue
+            if atom_equal(atom, b1):
+                att_confs.append({b1.GetIdx(): atom.GetIdx()})
+            elif atom_equal(atom, b2):
+                att_confs.append({b2.GetIdx(): atom.GetIdx()})
+    else:
+        # intersection is an atom
+        for a1 in ctr_atoms:
+            for a2 in nei_mol.GetAtoms():
+                if atom_equal(a1, a2):
+                    # Optimize if atom is carbon (other atoms may change valence)
+                    if a1.GetAtomicNum() == 6 and a1.GetTotalNumHs() + a2.GetTotalNumHs() < 4:
+                        continue
+                    att_confs.append({a2.GetIdx(): a1.GetIdx()})
+        # intersection is a bond
+        if ctr_mol.GetNumBonds() > 1:
+            for b1 in ctr_bonds:
+                for b2 in nei_mol.GetBonds():
+                    if ring_bond_equal(b1, b2):
+                        att_confs.append({b2.GetBeginAtom().GetIdx(): b1.GetBeginAtom().GetIdx(),
+                                          b2.GetEndAtom().GetIdx(): b1.GetEndAtom().GetIdx()})
+                    if ring_bond_equal(b1, b2, reverse=True):
+                        att_confs.append({b2.GetEndAtom().GetIdx(): b1.GetBeginAtom().GetIdx(),
+                                          b2.GetBeginAtom().GetIdx(): b1.GetEndAtom().GetIdx()})
+    return att_confs
+def enumerate_assemble(mol, idxs, current, next):
+    ctr_mol = get_submol(mol, idxs, mark=current.clique)
+    ground_truth = get_submol(mol, list(set(idxs) | set(next.clique)))
+    # submol can also obtained with get_clique_mol, future exploration
+    ground_truth_smiles = get_smiles(ground_truth)
+    cand_smiles = []
+    cand_mols = []
+    cand_amap = enum_attach(ctr_mol, next.mol)
+    for amap in cand_amap:
+        try:
+            cand_mol, _ = attach(ctr_mol, next.mol, amap)
+            cand_mol = sanitize(cand_mol)
+        except:
+            continue
+        if cand_mol is None:
+            continue
+        smiles = get_smiles(cand_mol)
+        if smiles in cand_smiles or smiles == ground_truth_smiles:
+            continue
+        cand_smiles.append(smiles)
+        cand_mols.append(cand_mol)
+    if len(cand_mols) >= 1:
+        cand_mols = sample(cand_mols, 1)
+        cand_mols.append(ground_truth)
+        labels = torch.tensor([0, 1])
+    else:
+        cand_mols = [ground_truth]
+        labels = torch.tensor([1])
+    return labels, cand_mols
+# allowable node and edge features
+# Features are encoded as the position of a value inside these lists (see the
+# ``.index(...)`` calls in mol_to_graph_data_obj_simple), so the order of the
+# entries is part of the encoding and must not be changed.
+allowable_features = {
+    'possible_atomic_num_list' : list(range(1, 119)),
+    'possible_formal_charge_list' : [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5],
+    'possible_chirality_list' : [
+        Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
+        Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
+        Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
+        Chem.rdchem.ChiralType.CHI_OTHER
+    ],
+    'possible_hybridization_list' : [
+        Chem.rdchem.HybridizationType.S,
+        Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2,
+        Chem.rdchem.HybridizationType.SP3, Chem.rdchem.HybridizationType.SP3D,
+        Chem.rdchem.HybridizationType.SP3D2, Chem.rdchem.HybridizationType.UNSPECIFIED
+    ],
+    'possible_numH_list' : [0, 1, 2, 3, 4, 5, 6, 7, 8],
+    'possible_implicit_valence_list' : [0, 1, 2, 3, 4, 5, 6],
+    'possible_degree_list' : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+    'possible_bonds' : [
+        Chem.rdchem.BondType.SINGLE,
+        Chem.rdchem.BondType.DOUBLE,
+        Chem.rdchem.BondType.TRIPLE,
+        Chem.rdchem.BondType.AROMATIC
+    ],
+    'possible_bond_dirs' : [ # only for double bond stereo information
+        Chem.rdchem.BondDir.NONE,
+        Chem.rdchem.BondDir.ENDUPRIGHT,
+        Chem.rdchem.BondDir.ENDDOWNRIGHT
+    ]
+}
+def mol_to_graph_data_obj_simple(mol):
+    """
+    Converts rdkit mol object to graph Data object required by the pytorch
+    geometric package. NB: Uses simplified atom and bond features, and represent
+    as indices
+    :param mol: rdkit mol object
+    :return: graph data object with the attributes: x, edge_index, edge_attr
+    """
+    # atoms
+    num_atom_features = 2   # atom type,  chirality tag
+    atom_features_list = []
+    for atom in mol.GetAtoms():
+        atom_feature = [allowable_features['possible_atomic_num_list'].index(
+            atom.GetAtomicNum())] + [allowable_features[
+            'possible_chirality_list'].index(atom.GetChiralTag())]
+        atom_features_list.append(atom_feature)
+    x = torch.tensor(np.array(atom_features_list), dtype=torch.long)
+    # bonds
+    num_bond_features = 2   # bond type, bond direction
+    if len(mol.GetBonds()) > 0: # mol has bonds
+        edges_list = []
+        edge_features_list = []
+        for bond in mol.GetBonds():
+            i = bond.GetBeginAtomIdx()
+            j = bond.GetEndAtomIdx()
+            edge_feature = [allowable_features['possible_bonds'].index(
+                bond.GetBondType())] + [allowable_features[
+                                            'possible_bond_dirs'].index(
+                bond.GetBondDir())]
+            edges_list.append((i, j))
+            edge_features_list.append(edge_feature)
+            edges_list.append((j, i))
+            edge_features_list.append(edge_feature)
+        # data.edge_index: Graph connectivity in COO format with shape [2, num_edges]
+        edge_index = torch.tensor(np.array(edges_list).T, dtype=torch.long)
+        # data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]
+        edge_attr = torch.tensor(np.array(edge_features_list),
+                                 dtype=torch.long)
+    else:   # mol has no bonds
+        edge_index = torch.empty((2, 0), dtype=torch.long)
+        edge_attr = torch.empty((0, num_bond_features), dtype=torch.long)
+    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
+    return data
+# For inference
+def assemble(mol_list, next_motif_smiles):
+    attach_fail = torch.zeros(len(mol_list)).bool()
+    cand_mols, cand_batch, new_atoms, cand_smiles, one_atom_attach, intersection = [], [], [], [], [], []
+    for i in range(len(mol_list)):
+        next = Chem.MolFromSmiles(next_motif_smiles[i])
+        cand_amap = enum_attach(mol_list[i], next)
+        if len(cand_amap) == 0:
+            attach_fail[i] = True
+            cand_mols.append(mol_list[i])
+            cand_batch.append(i)
+            one_atom_attach.append(-1)
+            intersection.append([])
+            new_atoms.append([])
+        else:
+            valid_cand = 0
+            for amap in cand_amap:
+                amap_len = len(amap)
+                iter_atoms = [v for v in amap.values()]
+                ctr_mol = deepcopy(mol_list[i])
+                cand_mol, amap1 = attach(ctr_mol, next, amap)
+                if sanitize(deepcopy(cand_mol)) is None:
+                    continue
+                smiles = get_smiles(cand_mol)
+                cand_smiles.append(smiles)
+                cand_mols.append(cand_mol)
+                cand_batch.append(i)
+                new_atoms.append([v for v in amap1.values()])
+                one_atom_attach.append(amap_len)
+                intersection.append(iter_atoms)
+                valid_cand+=1
+            if valid_cand==0:
+                attach_fail[i] = True
+                cand_mols.append(mol_list[i])
+                cand_batch.append(i)
+                one_atom_attach.append(-1)
+                intersection.append([])
+                new_atoms.append([])
+    cand_batch = torch.tensor(cand_batch)
+    one_atom_attach = torch.tensor(one_atom_attach) == 1
+    return cand_mols, cand_batch, new_atoms, one_atom_attach, intersection, attach_fail
+if __name__ == "__main__":
+    # Ad-hoc smoke test; only runs when this module is executed as a script.
+    import sys
+    from mol_tree import MolTree
+    # only let CRITICAL messages from the rdkit logger through
+    lg = rdkit.RDLogger.logger()
+    lg.setLevel(rdkit.RDLogger.CRITICAL)
+    # NOTE(review): this list is not referenced by the code below
+    smiles = ["O=C1[C@@H]2C=C[C@@H](C=CC2)C1(c1ccccc1)c1ccccc1", "O=C([O-])CC[C@@]12CCCC[C@]1(O)OC(=O)CC2",
+              "ON=C1C[C@H]2CC3(C[C@@H](C1)c1ccccc12)OCCO3",
+              "C[C@H]1CC(=O)[C@H]2[C@@]3(O)C(=O)c4cccc(O)c4[C@@H]4O[C@@]43[C@@H](O)C[C@]2(O)C1",
+              'Cc1cc(NC(=O)CSc2nnc3c4ccccc4n(C)c3n2)ccc1Br', 'CC(C)(C)c1ccc(C(=O)N[C@H]2CCN3CCCc4cccc2c43)cc1',
+              "O=c1c2ccc3c(=O)n(-c4nccs4)c(=O)c4ccc(c(=O)n1-c1nccs1)c2c34", "O=C(N1CCc2c(F)ccc(F)c2C1)C1(O)Cc2ccccc2C1"]
+    # sanity check: a tree built from a single carbon must have at least one node
+    mol_tree = MolTree("C")
+    assert len(mol_tree.nodes) > 0
+    def count():
+        # Reads stdin line by line, builds a MolTree from the first
+        # whitespace-separated token of each line and accumulates the number
+        # of candidates (cnt) and of nodes (n).
+        # NOTE(review): the totals are never printed or returned (the print below is commented out).
+        cnt, n = 0, 0
+        for s in sys.stdin:
+            s = s.split()[0]
+            tree = MolTree(s)
+            tree.recover()
+            tree.assemble()
+            for node in tree.nodes:
+                cnt += len(node.cands)
+            n += len(tree.nodes)
+            # print cnt * 1.0 / n
+    count()

utils/data.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import copy
+import torch
+import numpy as np
+from torch_geometric.data import Data, Batch
+# from torch_geometric.loader import DataLoader
+from torch.utils.data import Dataset
+# Attribute names for which per-sample batch vectors are wanted.
+# NOTE(review): not referenced by any code visible in this module (batch_from_data_list
+# passes its own list) - presumably consumed by a DataLoader elsewhere; confirm.
+FOLLOW_BATCH = ['protein_element', 'ligand_context_element', 'pos_real', 'pos_fake']
+class ProteinLigandData(object):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    @staticmethod
+    def from_protein_ligand_dicts(protein_dict=None, ligand_dict=None, **kwargs):
+        instance = ProteinLigandData(**kwargs)
+        if protein_dict is not None:
+            for key, item in protein_dict.items():
+                instance['protein_' + key] = item
+        if ligand_dict is not None:
+            for key, item in ligand_dict.items():
+                if key == 'moltree':
+                    instance['moltree'] = item
+                else:
+                    instance['ligand_' + key] = item
+        # instance['ligand_nbh_list'] = {i.item():[j.item() for k, j in enumerate(instance.ligand_bond_index[1]) if instance.ligand_bond_index[0, k].item() == i] for i in instance.ligand_bond_index[0]}
+        return instance
+def batch_from_data_list(data_list):
+    return Batch.from_data_list(data_list, follow_batch=['ligand_element', 'protein_element'])
+def torchify_dict(data):
+    output = {}
+    for k, v in data.items():
+        if isinstance(v, np.ndarray):
+            output[k] = torch.from_numpy(v)
+        else:
+            output[k] = v
+    return output
+def collate_mols(mol_dicts):
+    data_batch = {}
+    batch_size = len(mol_dicts)
+    for key in ['protein_pos', 'protein_atom_feature', 'ligand_context_pos', 'ligand_context_feature_full',
+                'ligand_frontier', 'num_atoms', 'next_wid', 'current_wid', 'current_atoms', 'cand_labels',
+                'ligand_pos_torsion', 'ligand_feature_torsion', 'true_sin', 'true_cos', 'true_three_hop',
+                'dihedral_mask', 'protein_contact', 'true_dm', 'alpha_carbon_indicator']:
+        data_batch[key] = torch.cat([mol_dict[key] for mol_dict in mol_dicts], dim=0)
+    # unsqueeze dim0
+    for key in ['xn_pos', 'yn_pos', 'ligand_torsion_xy_index', 'y_pos']:
+        cat_list = [mol_dict[key].unsqueeze(0) for mol_dict in mol_dicts if len(mol_dict[key]) > 0]
+        if len(cat_list) > 0:
+            data_batch[key] = torch.cat(cat_list, dim=0)
+        else:
+            data_batch[key] = torch.tensor([])
+    # follow batch
+    for key in ['protein_element', 'ligand_context_element', 'current_atoms']:
+        repeats = torch.tensor([len(mol_dict[key]) for mol_dict in mol_dicts])
+        data_batch[key + '_batch'] = torch.repeat_interleave(torch.arange(batch_size), repeats)
+    for key in ['ligand_element_torsion']:
+        repeats = torch.tensor([len(mol_dict[key]) for mol_dict in mol_dicts if len(mol_dict[key]) > 0])
+        if len(repeats) > 0:
+            data_batch[key + '_batch'] = torch.repeat_interleave(torch.arange(len(repeats)), repeats)
+        else:
+            data_batch[key + '_batch'] = torch.tensor([])
+    # distance matrix prediction
+    p_idx, q_idx = torch.cartesian_prod(torch.arange(4), torch.arange(2)).chunk(2, dim=-1)
+    p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
+    protein_offsets = torch.cumsum(data_batch['protein_element_batch'].bincount(), dim=0)
+    ligand_offsets = torch.cumsum(data_batch['ligand_context_element_batch'].bincount(), dim=0)
+    protein_offsets, ligand_offsets = torch.cat([torch.tensor([0]), protein_offsets]), torch.cat([torch.tensor([0]), ligand_offsets])
+    ligand_idx, protein_idx = [], []
+    for i, mol_dict in enumerate(mol_dicts):
+        if len(mol_dict['true_dm']) > 0:
+            protein_idx.append(mol_dict['dm_protein_idx'][p_idx] + protein_offsets[i])
+            ligand_idx.append(mol_dict['dm_ligand_idx'][q_idx] + ligand_offsets[i])
+    if len(ligand_idx) > 0:
+        data_batch['dm_ligand_idx'], data_batch['dm_protein_idx'] = torch.cat(ligand_idx), torch.cat(protein_idx)
+    # structure refinement (alpha carbon - ligand atom)
+    sr_ligand_idx, sr_protein_idx = [], []
+    for i, mol_dict in enumerate(mol_dicts):
+        if len(mol_dict['true_dm']) > 0:
+            ligand_atom_index = torch.arange(len(mol_dict['ligand_context_pos']))
+            p_idx, q_idx = torch.cartesian_prod(torch.arange(len(mol_dict['ligand_context_pos'])), torch.arange(len(mol_dict['protein_alpha_carbon_index']))).chunk(2, dim=-1)
+            p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
+            sr_ligand_idx.append(ligand_atom_index[p_idx] + ligand_offsets[i])
+            sr_protein_idx.append(mol_dict['protein_alpha_carbon_index'][q_idx] + protein_offsets[i])
+    if len(sr_ligand_idx) > 0:
+        data_batch['sr_ligand_idx'], data_batch['sr_protein_idx'] = torch.cat(sr_ligand_idx).long(), torch.cat(sr_protein_idx).long()
+    # structure refinement (ligand atom - ligand atom)
+    sr_ligand_idx0, sr_ligand_idx1 = [], []
+    for i, mol_dict in enumerate(mol_dicts):
+        if len(mol_dict['true_dm']) > 0:
+            ligand_atom_index = torch.arange(len(mol_dict['ligand_context_pos']))
+            p_idx, q_idx = torch.cartesian_prod(torch.arange(len(mol_dict['ligand_context_pos'])), torch.arange(len(mol_dict['ligand_context_pos']))).chunk(2, dim=-1)
+            p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
+            sr_ligand_idx0.append(ligand_atom_index[p_idx] + ligand_offsets[i])
+            sr_ligand_idx1.append(ligand_atom_index[q_idx] + ligand_offsets[i])
+    if len(ligand_idx) > 0:
+        data_batch['sr_ligand_idx0'], data_batch['sr_ligand_idx1'] = torch.cat(sr_ligand_idx0).long(), torch.cat(sr_ligand_idx1).long()
+    # index
+    if len(data_batch['y_pos']) > 0:
+        repeats = torch.tensor([len(mol_dict['ligand_element_torsion']) for mol_dict in mol_dicts if len(mol_dict['ligand_element_torsion']) > 0])
+        offsets = torch.cat([torch.tensor([0]), torch.cumsum(repeats, dim=0)])[:-1]
+        data_batch['ligand_torsion_xy_index'] += offsets.unsqueeze(1)
+    offsets1 = torch.cat([torch.tensor([0]), torch.cumsum(data_batch['num_atoms'], dim=0)])[:-1]
+    data_batch['current_atoms'] += torch.repeat_interleave(offsets1, data_batch['current_atoms_batch'].bincount())
+    # cand mols: torch geometric Data
+    cand_mol_list = []
+    for data in mol_dicts:
+        if len(data['cand_labels']) > 0:
+            cand_mol_list.extend(data['cand_mols'])
+    if len(cand_mol_list) > 0:
+        data_batch['cand_mols'] = Batch.from_data_list(cand_mol_list)
+    return data_batch

utils/datasets/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import torch
+from torch.utils.data import Subset
+from .pl import PocketLigandPairDataset
+import random
+def get_dataset(config, *args, **kwargs):
+    name = config.name
+    root = config.path
+    if name == 'pl':
+        dataset = PocketLigandPairDataset(root, *args, **kwargs)
+    else:
+        raise NotImplementedError('Unknown dataset: %s' % name)
+    if 'split' in config:
+        split_by_name = torch.load(config.split)
+        split = {k: [dataset.name2id[n] for n in names if n in dataset.name2id] for k, names in split_by_name.items()}
+        subsets = {k:Subset(dataset, indices=v) for k, v in split.items()}
+        return dataset, subsets
+    else:
+        return dataset

utils/datasets/pl.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import os
+import pickle
+import lmdb
+import torch
+from torch.utils.data import Dataset
+from tqdm.auto import tqdm
+import numpy as np
+from ..protein_ligand import PDBProtein, parse_sdf_file
+from ..data import ProteinLigandData, torchify_dict
+from ..mol_tree import MolTree
+def reset_moltree_root(moltree, ligand_pos, protein_pos):
+    ligand2 = np.sum(np.square(ligand_pos), 1, keepdims=True)
+    protein2 = np.sum(np.square(protein_pos), 1, keepdims=True)
+    dist = np.add(np.add(-2 * np.dot(ligand_pos, protein_pos.T), ligand2), protein2.T)
+    min_dist = np.min(dist, 1)
+    avg_min_dist = []
+    for node in moltree.nodes:
+        avg_min_dist.append(np.min(min_dist[node.clique]))
+    root = np.argmin(avg_min_dist)
+    if root > 0:
+        moltree.nodes[0], moltree.nodes[root] = moltree.nodes[root], moltree.nodes[0]
+    contact_idx = np.argmin(np.min(dist[moltree.nodes[0].clique], 0))
+    contact_protein = torch.tensor(np.min(dist, 0) < 4 ** 2)
+    return moltree, contact_protein, torch.tensor([contact_idx])
+def from_protein_ligand_dicts(protein_dict=None, ligand_dict=None):
+    instance = {}
+    if protein_dict is not None:
+        for key, item in protein_dict.items():
+            instance['protein_' + key] = item
+    if ligand_dict is not None:
+        for key, item in ligand_dict.items():
+            if key == 'moltree':
+                instance['moltree'] = item
+            else:
+                instance['ligand_' + key] = item
+    return instance
+class PocketLigandPairDataset(Dataset):
+    def __init__(self, raw_path, transform=None):
+        super().__init__()
+        self.raw_path = raw_path.rstrip('/')
+        self.index_path = os.path.join(self.raw_path, 'index.pt')
+        self.processed_path = os.path.join(os.path.dirname(self.raw_path),
+                                           os.path.basename(self.raw_path) + '_processed.lmdb')
+        self.name2id_path = os.path.join(os.path.dirname(self.raw_path),
+                                         os.path.basename(self.raw_path) + '_name2id.pt')
+        self.transform = transform
+        self.db = None
+        self.keys = None
+        if not os.path.exists(self.processed_path):
+            self._process()
+            self._precompute_name2id()
+        self.name2id = torch.load(self.name2id_path)
+    def _connect_db(self):
+        """
+            Establish read-only database connection
+        """
+        assert self.db is None, 'A connection has already been opened.'
+        self.db = lmdb.open(
+            self.processed_path,
+            map_size=10 * (1024 * 1024 * 1024),  # 10GB
+            create=False,
+            subdir=False,
+            readonly=True,
+            lock=False,
+            readahead=False,
+            meminit=False,
+        )
+        with self.db.begin() as txn:
+            self.keys = list(txn.cursor().iternext(values=False))
+    def _close_db(self):
+        self.db.close()
+        self.db = None
+        self.keys = None
+    def _process(self):
+        db = lmdb.open(
+            self.processed_path,
+            map_size=10 * (1024 * 1024 * 1024),  # 10GB
+            create=True,
+            subdir=False,
+            readonly=False,  # Writable
+        )
+        #with open(self.index_path, 'rb') as f:
+            #index = pickle.load(f)
+        index = torch.load(self.index_path)
+        vocab = []
+        for line in open('./vocab.txt'):
+            p, _, _ = line.partition(':')
+            vocab.append(p)
+        num_skipped = 0
+        with db.begin(write=True, buffers=True) as txn:
+            for i, pdbid in enumerate(tqdm(index)):
+                if pdbid is None: continue
+                try:
+                    ligand_fn = os.path.join(pdbid, pdbid + '_ligand.sdf')
+                    pocket_fn = os.path.join(pdbid, pdbid + '_pocket.pdb')
+                    pocket_dict = PDBProtein(os.path.join(self.raw_path, pocket_fn)).to_dict_atom()
+                    ligand_dict = parse_sdf_file(os.path.join(self.raw_path, ligand_fn))
+                    ligand_dict['moltree'], pocket_dict['contact'], pocket_dict['contact_idx'] = reset_moltree_root(
+                        ligand_dict['moltree'],
+                        ligand_dict['pos'],
+                        pocket_dict['pos'])
+                    data = from_protein_ligand_dicts(
+                        protein_dict=torchify_dict(pocket_dict),
+                        ligand_dict=torchify_dict(ligand_dict),
+                    )
+                    data['protein_filename'] = pocket_fn
+                    data['ligand_filename'] = ligand_fn
+                    data['pdbid'] = pdbid
+                    txn.put(
+                        key=str(i).encode(),
+                        value=pickle.dumps(data)
+                    )
+                    for c in ligand_dict['moltree'].nodes:
+                        smile_cluster = c.smiles
+                        assert smile_cluster in vocab
+                except:
+                    num_skipped += 1
+                    print('Skipping (%d) %s' % (num_skipped, ligand_fn,))
+                    continue
+        db.close()
+    def _precompute_name2id(self):
+        name2id = {}
+        for i in tqdm(range(self.__len__()), 'Indexing'):
+            try:
+                data = self.__getitem__(i)
+            except AssertionError as e:
+                print(i, e)
+                continue
+            name = data['pdbid']
+            name2id[name] = i
+        torch.save(name2id, self.name2id_path)
+    def __len__(self):
+        if self.db is None:
+            self._connect_db()
+        return len(self.keys)
+    def __getitem__(self, idx):
+        if self.db is None:
+            self._connect_db()
+        key = self.keys[idx]
+        data = pickle.loads(self.db.begin().get(key))
+        data['id'] = idx
+        assert data['protein_pos'].size(0) > 0
+        if self.transform is not None:
+            data = self.transform(data)
+        return data
+if __name__ == '__main__':
+    # Command-line entry point: instantiate the dataset for the given raw data
+    # directory; the constructor builds the processed database when it is missing.
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', type=str)
+    args = parser.parse_args()
+    PocketLigandPairDataset(args.path)

utils/dihedral_utils.py ADDED Viewed

	@@ -0,0 +1,383 @@

+import torch
+import torch_geometric as tg
+from torch_geometric.utils import degree
+import networkx as nx
+# Use the GPU when one is available; the lookup tables below are created on this device.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Row k (k = 0..4, the number of neighbors present) flags which of the six
+# entries of ``angle_combos`` exist: 0, 0, 1, 3 and 6 pairs respectively.
+angle_mask_ref = torch.LongTensor([[0, 0, 0, 0, 0, 0],
+                                   [0, 0, 0, 0, 0, 0],
+                                   [1, 0, 0, 0, 0, 0],
+                                   [1, 1, 1, 0, 0, 0],
+                                   [1, 1, 1, 1, 1, 1]]).to(device)
+# The six unordered pairs of neighbor slots (0..3) used by batch_angles_from_coords.
+angle_combos = torch.LongTensor([[0, 1],
+                                 [0, 2],
+                                 [1, 2],
+                                 [0, 3],
+                                 [1, 3],
+                                 [2, 3]]).to(device)
+def get_neighbor_ids(data):
+    """
+    Takes the edge indices and returns dictionary mapping atom index to neighbor indices
+    Note: this only includes atoms with degree > 1
+    """
+    # start, end = edge_index
+    # idxs, vals = torch.unique(start, return_counts=True)
+    # vs = torch.split_with_sizes(end, tuple(vals))
+    # return {k.item(): v for k, v in zip(idxs, vs) if len(v) > 1}
+    neighbors = data.neighbors.pop(0)
+    n_atoms_per_mol = data.batch.bincount()
+    n_atoms_prev_mol = 0
+    for i, n_dict in enumerate(data.neighbors):
+        new_dict = {}
+        n_atoms_prev_mol += n_atoms_per_mol[i].item()
+        for k, v in n_dict.items():
+            new_dict[k + n_atoms_prev_mol] = v + n_atoms_prev_mol
+        neighbors.update(new_dict)
+    return neighbors
+def get_neighbor_bonds(edge_index, bond_type):
+    """
+    Takes the edge indices and bond type and returns dictionary mapping atom index to neighbor bond types
+    Note: this only includes atoms with degree > 1
+    """
+    start, end = edge_index
+    idxs, vals = torch.unique(start, return_counts=True)
+    vs = torch.split_with_sizes(bond_type, tuple(vals))
+    return {k.item(): v for k, v in zip(idxs, vs) if len(v) > 1}
+def get_leaf_hydrogens(neighbors, x):
+    """
+    Takes the edge indices and atom features and returns dictionary mapping atom index to neighbors, indicating true
+    for hydrogens that are leaf nodes
+    Note: this only works because degree = 1 and hydrogen atomic number = 1 (checks when 1 == 1)
+    Note: we use the 5th feature index bc this corresponds to the atomic number
+    """
+    # start, end = edge_index
+    # degrees = degree(end)
+    # idxs, vals = torch.unique(start, return_counts=True)
+    # vs = torch.split_with_sizes(end, tuple(vals))
+    # return {k.item(): degrees[v] == x[v, 5] for k, v in zip(idxs, vs) if len(v) > 1}
+    leaf_hydrogens = {}
+    h_mask = x[:, 0] == 1
+    for k, v in neighbors.items():
+        leaf_hydrogens[k] = h_mask[neighbors[k]]
+    return leaf_hydrogens
+def get_dihedral_pairs(edge_index, data):
+    """
+    Given edge indices, return pairs of indices that we must calculate dihedrals for
+    """
+    start, end = edge_index
+    # degree of every node, counted from the target row of edge_index
+    degrees = degree(end)
+    # edges whose two end points both have more than one neighbor
+    dihedral_pairs_true = torch.nonzero(torch.logical_and(degrees[start] > 1, degrees[end] > 1))
+    dihedral_pairs = edge_index[:, dihedral_pairs_true].squeeze(-1)
+    # # first method which removes one (pseudo) random edge from a cycle
+    # columns whose first entry sorts first; presumably this keeps one
+    # direction of every bidirectional edge - TODO confirm
+    dihedral_idxs = torch.nonzero(dihedral_pairs.sort(dim=0).indices[0, :] == 0).squeeze().detach().cpu().numpy()
+    # prioritize rings for assigning dihedrals
+    dihedral_pairs = dihedral_pairs.t()[dihedral_idxs]
+    G = nx.to_undirected(tg.utils.to_networkx(data))
+    cycles = nx.cycle_basis(G)
+    # keep: pairs to return; sorted_keep: order-independent record of pairs already handled
+    keep, sorted_keep = [], []
+    # a single selected pair comes back 1-d; restore the [n_pairs, 2] layout
+    if len(dihedral_pairs.shape) == 1:
+        dihedral_pairs = dihedral_pairs.unsqueeze(0)
+    for pair in dihedral_pairs:
+        x, y = pair
+        if sorted(pair) in sorted_keep:
+            continue
+        y_cycle_check = [y in cycle for cycle in cycles]
+        x_cycle_check = [x in cycle for cycle in cycles]
+        # both atoms are ring members: take the pairs returned by
+        # get_current_cycle_indices (defined elsewhere) instead of the pair itself
+        if any(x_cycle_check) and any(y_cycle_check):  # both in new cycle
+            cycle_indices = get_current_cycle_indices(cycles, x_cycle_check, x)
+            keep.extend(cycle_indices)
+            sorted_keep.extend([sorted(c) for c in cycle_indices])
+            continue
+        # only y is a ring member: keep the pair and the pairs returned for y's ring
+        if any(y_cycle_check):
+            cycle_indices = get_current_cycle_indices(cycles, y_cycle_check, y)
+            keep.append(pair)
+            keep.extend(cycle_indices)
+            sorted_keep.append(sorted(pair))
+            sorted_keep.extend([sorted(c) for c in cycle_indices])
+            continue
+        # remaining cases (including "only x is a ring member"): keep the pair as is
+        keep.append(pair)
+    keep = [t.to(device) for t in keep]
+    # NOTE(review): torch.stack raises on an empty list, i.e. when no pair was kept
+    return torch.stack(keep).t()
+def batch_distance_metrics_from_coords(coords, mask):
+    """
+    Given coordinates of neighboring atoms, compute bond
+    distances and 2-hop distances in local neighborhood
+    """
+    d_mat_mask = mask.unsqueeze(1) * mask.unsqueeze(2)
+    if coords.dim() == 4:
+        two_dop_d_mat = torch.square(coords.unsqueeze(1) - coords.unsqueeze(2) + 1e-10).sum(dim=-1).sqrt() * d_mat_mask.unsqueeze(-1)
+        one_hop_ds = torch.linalg.norm(torch.zeros_like(coords[0]).unsqueeze(0) - coords, dim=-1)
+    elif coords.dim() == 5:
+        two_dop_d_mat = torch.square(coords.unsqueeze(2) - coords.unsqueeze(3) + 1e-10).sum(dim=-1).sqrt() * d_mat_mask.unsqueeze(-1).unsqueeze(1)
+        one_hop_ds = torch.linalg.norm(torch.zeros_like(coords[0]).unsqueeze(0) - coords, dim=-1)
+    return one_hop_ds, two_dop_d_mat
+def batch_angle_between_vectors(a, b):
+    """
+    Compute angle between two batches of input vectors
+    """
+    inner_product = (a * b).sum(dim=-1)
+    # norms
+    a_norm = torch.linalg.norm(a, dim=-1)
+    b_norm = torch.linalg.norm(b, dim=-1)
+    # protect denominator during division
+    den = a_norm * b_norm + 1e-10
+    cos = inner_product / den
+    return cos
+def batch_angles_from_coords(coords, mask):
+    """
+    Given coordinates, compute all local neighborhood angles
+    """
+    if coords.dim() == 4:
+        all_possible_combos = coords[:, angle_combos]
+        v_a, v_b = all_possible_combos.split(1, dim=2)  # does one of these need to be negative?
+        angle_mask = angle_mask_ref[mask.sum(dim=1).long()]
+        angles = batch_angle_between_vectors(v_a.squeeze(2), v_b.squeeze(2)) * angle_mask.unsqueeze(-1)
+    elif coords.dim() == 5:
+        all_possible_combos = coords[:, :, angle_combos]
+        v_a, v_b = all_possible_combos.split(1, dim=3)  # does one of these need to be negative?
+        angle_mask = angle_mask_ref[mask.sum(dim=1).long()]
+        angles = batch_angle_between_vectors(v_a.squeeze(3), v_b.squeeze(3)) * angle_mask.unsqueeze(-1).unsqueeze(-1)
+    return angles
+def batch_local_stats_from_coords(coords, mask):
+    """
+    Given neighborhood neighbor coordinates, compute bond distances,
+    2-hop distances, and angles in local neighborhood (this assumes
+    the central atom has coordinates at the origin)
+    """
+    one_hop_ds, two_dop_d_mat = batch_distance_metrics_from_coords(coords, mask)
+    angles = batch_angles_from_coords(coords, mask)
+    return one_hop_ds, two_dop_d_mat, angles
+def batch_dihedrals(p0, p1, p2, p3, angle=False):
+    s1 = p1 - p0
+    s2 = p2 - p1
+    s3 = p3 - p2
+    sin_d_ = torch.linalg.norm(s2, dim=-1) * torch.sum(s1 * torch.cross(s2, s3, dim=-1), dim=-1)
+    cos_d_ = torch.sum(torch.cross(s1, s2, dim=-1) * torch.cross(s2, s3, dim=-1), dim=-1)
+    if angle:
+        return torch.atan2(sin_d_, cos_d_ + 1e-10)
+    else:
+        den = torch.linalg.norm(torch.cross(s1, s2, dim=-1), dim=-1) * torch.linalg.norm(torch.cross(s2, s3, dim=-1), dim=-1) + 1e-10
+        return sin_d_/den, cos_d_/den
+def batch_vector_angles(xn, x, y, yn):
+    uT = xn.view(-1, 3)
+    uX = x.view(-1, 3)
+    uY = y.view(-1, 3)
+    uZ = yn.view(-1, 3)
+    b1 = uT - uX
+    b2 = uZ - uY
+    num = torch.bmm(b1.view(-1, 1, 3), b2.view(-1, 3, 1)).squeeze(-1).squeeze(-1)
+    den = torch.linalg.norm(b1, dim=-1) * torch.linalg.norm(b2, dim=-1) + 1e-10
+    return (num / den).view(-1, 9)
+def von_Mises_loss(a, b, a_sin=None, b_sin=None):
+    """
+    :param a: cos of first angle
+    :param b: cos of second angle
+    :return: difference of cosines
+    """
+    if torch.is_tensor(a_sin):
+        out = a * b + a_sin * b_sin
+    else:
+        out = a * b + torch.sqrt(1-a**2 + 1e-5) * torch.sqrt(1-b**2 + 1e-5)
+    return out
+def rotation_matrix(neighbor_coords, neighbor_mask, neighbor_map, mu=None):
+    """
+    Given predicted neighbor coordinates from model, return rotation matrix
+    :param neighbor_coords: neighbor coordinates for each edge as defined by dihedral_pairs
+        (n_dihedral_pairs, 4, n_generated_confs, 3)
+    :param neighbor_mask: mask describing which atoms are present (n_dihedral_pairs, 4)
+    :param neighbor_map: mask describing which neighbor corresponds to the other central dihedral atom
+        (n_dihedral_pairs, 4) each entry in neighbor_map should have one TRUE entry with the rest as FALSE
+    :return: rotation matrix (n_dihedral_pairs, n_model_confs, 3, 3)
+    """
+    # when no reference direction is supplied, use the mean of the neighbors
+    # that are NOT the other central atom
+    if not torch.is_tensor(mu):
+        # mu = neighbor_coords.sum(dim=1, keepdim=True) / (neighbor_mask.sum(dim=-1, keepdim=True).unsqueeze(-1).unsqueeze(-1) + 1e-10)
+        # the view with 3 relies on exactly one TRUE entry per row of neighbor_map (4 slots - 1)
+        mu_num = neighbor_coords[~neighbor_map.bool()].view(neighbor_coords.size(0), 3, neighbor_coords.size(2), -1).sum(dim=1)
+        # number of present neighbors minus the one selected by neighbor_map
+        mu_den = (neighbor_mask.sum(dim=-1, keepdim=True).unsqueeze(-1) - 1 + 1e-10)
+        mu = mu_num / mu_den  # presumably (n_dihedral_pairs, n_model_confs, 3) - the cross product below needs a size-3 last dim
+        mu = mu.squeeze(1)  # only has an effect when dim 1 has size 1
+    # coordinates of the neighbor flagged by neighbor_map
+    p_Y = neighbor_coords[neighbor_map.bool(), :]
+    # h1: unit vector along p_Y
+    h1 = p_Y / (torch.linalg.norm(p_Y, dim=-1, keepdim=True) + 1e-10)  # same shape as p_Y
+    # h3: unit normal of the plane spanned by p_Y and mu
+    h3_1 = torch.cross(p_Y, mu, dim=-1)
+    h3 = h3_1 / (torch.linalg.norm(h3_1, dim=-1, keepdim=True) + 1e-10)  # same shape as p_Y
+    # h2 completes the frame
+    h2 = -torch.cross(h1, h3, dim=-1)  # same shape as p_Y
+    # stack h1, h2, h3 as the rows of the returned matrix
+    H = torch.cat([h1.unsqueeze(-2),
+                   h2.unsqueeze(-2),
+                   h3.unsqueeze(-2)], dim=-2)
+    return H
+def rotation_matrix_v2(neighbor_coords):
+    """
+    Build an orthonormal frame whose first axis points along each input vector.
+    The second reference direction is drawn with torch.rand_like, so the result depends on
+    the global torch RNG state.
+    :param neighbor_coords: y or x coordinates for the x or y center node
+        (n_dihedral_pairs, 3)
+    :return: rotation matrix (n_dihedral_pairs, 3, 3); rows along dim -2 are h1, h2, h3
+    """
+    axis = neighbor_coords
+    axis_norm = torch.linalg.norm(axis, dim=-1, keepdim=True)
+    noise = torch.rand_like(axis)
+    # Remove the component of the random vector that is parallel to the axis
+    coeff = torch.sum(noise * axis, dim=-1, keepdim=True) / (axis_norm ** 2 + 1e-10)
+    perp = noise - coeff * axis
+    perp_unit = perp / torch.linalg.norm(perp, dim=-1, keepdim=True)
+    row1 = axis / (axis_norm + 1e-10)
+    normal = torch.cross(axis, perp_unit, dim=-1)
+    row3 = normal / (torch.linalg.norm(normal, dim=-1, keepdim=True) + 1e-10)
+    row2 = -torch.cross(row1, row3, dim=-1)
+    return torch.stack([row1, row2, row3], dim=-2)
+def signed_volume(local_coords):
+    """
+    Sign of the scalar triple product spanned by the first three neighbors relative to the fourth.
+    :param local_coords: (n_tetrahedral_chiral_centers, 4, n_generated_confs, 3)
+    :return: signed volume of each tetrahedral center (n_tetrahedral_chiral_centers, n_generated_confs)
+    """
+    apex = local_coords[:, 3]
+    edge_a = local_coords[:, 0] - apex
+    edge_b = local_coords[:, 1] - apex
+    edge_c = local_coords[:, 2] - apex
+    triple = torch.sum(edge_a * torch.cross(edge_b, edge_c, dim=-1), dim=-1)
+    return torch.sign(triple)
+def rotation_matrix_inf(neighbor_coords, neighbor_mask, neighbor_map):
+    """
+    Inference-time (single dihedral pair) variant: build a frame from neighbor coordinates.
+    :param neighbor_coords: neighbor coordinates for each edge as defined by dihedral_pairs (4, n_model_confs, 3)
+    :param neighbor_mask: mask describing which atoms are present (4)
+    :param neighbor_map: mask describing which neighbor corresponds to the other central dihedral atom (4)
+        each entry in neighbor_map should have one TRUE entry with the rest as FALSE
+    :return: rotation matrix with rows h1, h2, h3 on dim -2; with the documented input shapes this is
+        (n_model_confs, 3, 3)
+    """
+    n_present = neighbor_mask.sum(dim=-1, keepdim=True).unsqueeze(-1)
+    # Sum over all four neighbor slots divided by the number of present atoms
+    centroid = (neighbor_coords.sum(dim=0, keepdim=True) / (n_present + 1e-10)).squeeze(0)
+    anchor = neighbor_coords[neighbor_map.bool(), :].squeeze(0)
+    row1 = anchor / (torch.linalg.norm(anchor, dim=-1, keepdim=True) + 1e-10)
+    normal = torch.cross(anchor, centroid, dim=-1)
+    row3 = normal / (torch.linalg.norm(normal, dim=-1, keepdim=True) + 1e-10)
+    row2 = -torch.cross(row1, row3, dim=-1)
+    return torch.stack([row1, row2, row3], dim=-2)
+def build_alpha_rotation_inf(alpha, n_model_confs):
+    """
+    Build n_model_confs float32 CPU matrices that rotate by alpha in the plane of the last two axes.
+    :param alpha: rotation angle(s) in radians; must be assignable to a length-n_model_confs slice
+    :param n_model_confs: number of matrices to build
+    :return: tensor (n_model_confs, 3, 3) of the form [[1, 0, 0], [0, cos, -sin], [0, sin, cos]]
+    """
+    cos_a = torch.cos(alpha)
+    sin_a = torch.sin(alpha)
+    H_alpha = torch.zeros(n_model_confs, 3, 3, dtype=torch.float32, device='cpu')
+    H_alpha[:, 0, 0] = 1
+    H_alpha[:, 1, 1] = cos_a
+    H_alpha[:, 1, 2] = -sin_a
+    H_alpha[:, 2, 1] = sin_a
+    H_alpha[:, 2, 2] = cos_a
+    return H_alpha
+def random_rotation_matrix(dim):
+    """
+    Compose rotation matrices from three angles drawn with torch.rand(dim).
+    :param dim: size passed to torch.rand for each of yaw, pitch and roll (drawn in that order)
+    :return: tensor (*dim, 3, 3)
+    NOTE(review): torch.rand samples from [0, 1), so the angles only cover [0, 1) radians -
+    confirm that this restricted range is intended.
+    """
+    yaw = torch.rand(dim)
+    pitch = torch.rand(dim)
+    roll = torch.rand(dim)
+    cy, sy = torch.cos(yaw), torch.sin(yaw)
+    cp, sp = torch.cos(pitch), torch.sin(pitch)
+    cr, sr = torch.cos(roll), torch.sin(roll)
+    row_x = torch.stack([cy * cp, cy * sp * sr - sy * cr, cy * sp * cr + sy * sr], dim=-1)
+    row_y = torch.stack([sy * cp, sy * sp * sr + cy * cr, sy * sp * cr - cy * sr], dim=-1)
+    row_z = torch.stack([-sp, cp * sr, cp * cr], dim=-1)
+    return torch.stack([row_x, row_y, row_z], dim=-2)
+def length_to_mask(length, max_len=None, dtype=None):
+    """Turn a 1-D tensor of lengths (B) into a B x max_len mask that is true where position < length.
+    A falsy max_len (None or 0) falls back to the maximum of length.
+    When dtype is given, the boolean mask is converted to that dtype.
+    """
+    assert len(length.shape) == 1, 'Length shape should be 1 dimensional.'
+    if not max_len:
+        max_len = length.max().item()
+    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
+    mask = positions.expand(len(length), max_len) < length.unsqueeze(1)
+    if dtype is None:
+        return mask
+    return torch.as_tensor(mask, dtype=dtype, device=length.device)

utils/docking.py ADDED Viewed

	@@ -0,0 +1,183 @@

+import os
+import subprocess
+import random
+import string
+from easydict import EasyDict
+from rdkit import Chem
+from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule
+from .reconstruct import reconstruct_from_generated
+def get_random_id(length=30):
+    """Return `length` lowercase ASCII letters drawn with random.choice (not cryptographically secure)."""
+    alphabet = string.ascii_lowercase
+    picked = [random.choice(alphabet) for _ in range(length)]
+    return ''.join(picked)
+def load_pdb(path):
+    """Open `path` in text mode and return its whole content as a single string."""
+    with open(path, 'r') as handle:
+        contents = handle.read()
+    return contents
+def parse_qvina_outputs(docked_sdf_path):
+    """Read docked poses from an SDF file and return one EasyDict per readable pose.
+    Each entry holds the RDKit molecule, its position in the file (mode_id) and three floats
+    taken from the 3rd, 4th and 5th whitespace-separated tokens of the first line of the
+    molecule's 'REMARK' property: affinity, rmsd_lb and rmsd_ub.
+    """
+    poses = []
+    for mode_id, pose in enumerate(Chem.SDMolSupplier(docked_sdf_path)):
+        if pose is not None:
+            # Drop the first two tokens of the REMARK header line; the scores follow them
+            fields = pose.GetProp('REMARK').splitlines()[0].split()[2:]
+            record = EasyDict({
+                'rdmol': pose,
+                'mode_id': mode_id,
+                'affinity': float(fields[0]),
+                'rmsd_lb': float(fields[1]),
+                'rmsd_ub': float(fields[2]),
+            })
+            poses.append(record)
+    return poses
+class BaseDockingTask(object):
+    """Common state for docking tasks: a receptor PDB block and a ligand molecule.
+    Subclasses must implement run() and get_results().
+    """
+    def __init__(self, pdb_block, ligand_rdmol):
+        super().__init__()
+        self.ligand_rdmol = ligand_rdmol
+        self.pdb_block = pdb_block
+    def get_results(self):
+        """Return docking results; not implemented in the base class."""
+        raise NotImplementedError()
+    def run(self):
+        """Start docking; not implemented in the base class."""
+        raise NotImplementedError()
+class QVinaDockingTask(BaseDockingTask):
+    """Dock one ligand into one receptor by piping a shell script to a bash subprocess.
+    The constructor writes the receptor PDB and the ligand SDF into tmp_dir, run() starts the
+    script asynchronously, and get_results() parses the docked poses once the process has exited.
+    """
+    @classmethod
+    def from_generated_data(cls, data, protein_root='./data/crossdocked', **kwargs):
+        """Build a task for a generated sample: the protein path is derived from
+        data.ligand_filename and the ligand is rebuilt with reconstruct_from_generated."""
+        protein_fn = os.path.join(
+            os.path.dirname(data.ligand_filename),
+            os.path.basename(data.ligand_filename)[:10] + '.pdb'
+        )
+        protein_path = os.path.join(protein_root, protein_fn)
+        with open(protein_path, 'r') as f:
+            pdb_block = f.read()
+        ligand_rdmol = reconstruct_from_generated(data)
+        return cls(pdb_block, ligand_rdmol, **kwargs)
+    @classmethod
+    def from_original_data(cls, data, ligand_root='./data/crossdocked_pocket10', protein_root='./data/crossdocked', **kwargs):
+        """Build a task for a reference sample: the ligand is the first molecule of the SDF file
+        at ligand_root/data.ligand_filename."""
+        protein_fn = os.path.join(
+            os.path.dirname(data.ligand_filename),
+            os.path.basename(data.ligand_filename)[:10] + '.pdb'
+        )
+        protein_path = os.path.join(protein_root, protein_fn)
+        with open(protein_path, 'r') as f:
+            pdb_block = f.read()
+        ligand_path = os.path.join(ligand_root, data.ligand_filename)
+        ligand_rdmol = next(iter(Chem.SDMolSupplier(ligand_path)))
+        return cls(pdb_block, ligand_rdmol, **kwargs)
+    def __init__(self, pdb_block, ligand_rdmol, conda_env='adt', tmp_dir='./tmp', use_uff=True, center=None):
+        """
+        :param pdb_block: receptor as PDB text
+        :param ligand_rdmol: ligand RDKit molecule; hydrogens are added (and UFF-optimized if use_uff)
+        :param conda_env: conda environment activated by the docking script
+        :param tmp_dir: directory for the receptor/ligand/output files (created if missing)
+        :param use_uff: run UFFOptimizeMolecule on the hydrogenated ligand before writing it
+        :param center: search-box center; defaults to the midpoint of the ligand's bounding box
+        """
+        super().__init__(pdb_block, ligand_rdmol)
+        self.conda_env = conda_env
+        self.tmp_dir = os.path.realpath(tmp_dir)
+        # Create the normalized path that every later file operation uses
+        os.makedirs(self.tmp_dir, exist_ok=True)
+        self.task_id = get_random_id()
+        self.receptor_id = self.task_id + '_receptor'
+        self.ligand_id = self.task_id + '_ligand'
+        self.receptor_path = os.path.join(self.tmp_dir, self.receptor_id + '.pdb')
+        self.ligand_path = os.path.join(self.tmp_dir, self.ligand_id + '.sdf')
+        with open(self.receptor_path, 'w') as f:
+            f.write(pdb_block)
+        ligand_rdmol = Chem.AddHs(ligand_rdmol, addCoords=True)
+        if use_uff:
+            UFFOptimizeMolecule(ligand_rdmol)
+        sdf_writer = Chem.SDWriter(self.ligand_path)
+        sdf_writer.write(ligand_rdmol)
+        sdf_writer.close()
+        self.ligand_rdmol = ligand_rdmol
+        pos = ligand_rdmol.GetConformer(0).GetPositions()
+        if center is None:
+            self.center = (pos.max(0) + pos.min(0)) / 2
+        else:
+            self.center = center
+        self.proc = None
+        self.results = None
+        self.output = None
+        self.docked_sdf_path = None
+    def run(self, exhaustiveness=16):
+        """Start the docking script in a bash subprocess and return immediately.
+        NOTE(review): stdout/stderr are pipes that are only read after the process exits; a very
+        chatty run could fill the pipe buffer and stall - confirm output volume stays small.
+        """
+        commands = """
+eval "$(conda shell.bash hook)"
+conda activate {env}
+cd {tmp}
+# Prepare receptor (PDB->PDBQT)
+prepare_receptor4.py -r {receptor_id}.pdb
+# Prepare ligand
+obabel {ligand_id}.sdf -O{ligand_id}.pdbqt
+qvina2.1 \
+    --receptor {receptor_id}.pdbqt \
+    --ligand {ligand_id}.pdbqt \
+    --center_x {center_x:.4f} \
+    --center_y {center_y:.4f} \
+    --center_z {center_z:.4f} \
+    --size_x 20 --size_y 20 --size_z 20 \
+    --exhaustiveness {exhaust}
+obabel {ligand_id}_out.pdbqt -O{ligand_id}_out.sdf -h
+        """.format(
+            receptor_id = self.receptor_id,
+            ligand_id = self.ligand_id,
+            env = self.conda_env,
+            tmp = self.tmp_dir,
+            exhaust = exhaustiveness,
+            center_x = self.center[0],
+            center_y = self.center[1],
+            center_z = self.center[2],
+        )
+        self.docked_sdf_path = os.path.join(self.tmp_dir, '%s_out.sdf' % self.ligand_id)
+        self.proc = subprocess.Popen(
+            '/bin/bash',
+            shell=False,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE
+        )
+        self.proc.stdin.write(commands.encode('utf-8'))
+        self.proc.stdin.close()
+        # return commands
+    def run_sync(self):
+        """Run docking and block until it finishes; return the list of poses (possibly empty)."""
+        import time  # local import: only needed for the polling delay below
+        self.run()
+        results = self.get_results()
+        while results is None:
+            # Sleep between polls instead of spinning a CPU core while the docking runs
+            time.sleep(0.1)
+            results = self.get_results()
+        if results:
+            print('Best affinity:', results[0]['affinity'])
+        return results
+    def get_results(self):
+        """Return None while docking has not started or is still running, otherwise the parsed
+        poses. If the output cannot be parsed, an empty list is stored and returned on every call."""
+        if self.proc is None:   # Not started
+            return None
+        if self.proc.poll() is None:  # In progress
+            return None
+        if self.output is None:
+            self.output = self.proc.stdout.readlines()
+            try:
+                self.results = parse_qvina_outputs(self.docked_sdf_path)
+            except Exception:
+                print('[Error] Vina output error: %s' % self.docked_sdf_path)
+                # Remember the failure so later calls keep returning [] rather than None
+                self.results = []
+        return self.results

utils/fpscores.pkl.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:10dcef9340c873e7b987924461b0af5365eb8dd96be607203debe8ddf80c1e73
+size 3848394

utils/misc.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import os
+import time
+import random
+import logging
+import torch
+import numpy as np
+import yaml
+from easydict import EasyDict
+from logging import Logger
+from tqdm.auto import tqdm
+class BlackHole(object):
+    """Null object: attribute writes are discarded; missing-attribute reads and calls return the instance."""
+    def __getattr__(self, name):
+        return self
+    def __call__(self, *args, **kwargs):
+        return self
+    def __setattr__(self, name, value):
+        return None
+def load_config(path):
+    """Parse the YAML file at `path` with yaml.safe_load and wrap the result in an EasyDict."""
+    with open(path, 'r') as stream:
+        raw = yaml.safe_load(stream)
+    return EasyDict(raw)
+def get_logger(name, log_dir=None):
+    """Return the DEBUG-level logger called `name`, logging to the console and optionally to a file.
+    :param name: logger name passed to logging.getLogger
+    :param log_dir: when given, records are also written to <log_dir>/log.txt
+    :return: the configured logging.Logger
+    logging.getLogger returns the same object for the same name, so handlers are attached only
+    once; calling this function repeatedly no longer duplicates every log line.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    formatter = logging.Formatter('[%(asctime)s::%(name)s::%(levelname)s] %(message)s')
+    # FileHandler subclasses StreamHandler, hence the exact type check for the console handler
+    has_stream = any(type(h) is logging.StreamHandler for h in logger.handlers)
+    if not has_stream:
+        stream_handler = logging.StreamHandler()
+        stream_handler.setLevel(logging.DEBUG)
+        stream_handler.setFormatter(formatter)
+        logger.addHandler(stream_handler)
+    if log_dir is not None:
+        log_path = os.path.abspath(os.path.join(log_dir, 'log.txt'))
+        has_file = any(
+            isinstance(h, logging.FileHandler) and h.baseFilename == log_path
+            for h in logger.handlers
+        )
+        if not has_file:
+            file_handler = logging.FileHandler(log_path)
+            file_handler.setLevel(logging.DEBUG)
+            file_handler.setFormatter(formatter)
+            logger.addHandler(file_handler)
+    return logger
+def get_new_log_dir(root='./logs', prefix='', tag=''):
+    """Create and return root/[prefix_]<local timestamp>[_tag].
+    os.makedirs is called without exist_ok, so an already existing directory raises.
+    """
+    stamp = time.strftime('%Y_%m_%d__%H_%M_%S', time.localtime())
+    pieces = [piece for piece in (prefix, stamp, tag) if piece != '']
+    log_dir = os.path.join(root, '_'.join(pieces))
+    os.makedirs(log_dir)
+    return log_dir
+def seed_all(seed):
+    """Seed the torch, NumPy and Python `random` generators with the same value."""
+    for seeder in (torch.manual_seed, np.random.seed, random.seed):
+        seeder(seed)
+def log_hyperparams(writer, args):
+    """Write the attributes of `args` as hyperparameters through writer.file_writer.
+    Non-string values are stored as their repr(); no metrics are attached (empty metric dict).
+    """
+    from torch.utils.tensorboard.summary import hparams
+    hparam_dict = {}
+    for key, value in vars(args).items():
+        hparam_dict[key] = value if isinstance(value, str) else repr(value)
+    exp, ssi, sei = hparams(hparam_dict, {})
+    for summary in (exp, ssi, sei):
+        writer.file_writer.add_summary(summary)
+def int_tuple(argstr):
+    """Parse a comma-separated string such as '1,2,3' into a tuple of ints."""
+    return tuple(int(token) for token in argstr.split(','))
+def str_tuple(argstr):
+    """Split a comma-separated string into a tuple of its (unstripped) pieces."""
+    pieces = argstr.split(',')
+    return tuple(pieces)

utils/mol_tree.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import sys
+sys.path.append("..")
+import rdkit
+import rdkit.Chem as Chem
+import copy
+import pickle
+from tqdm.auto import tqdm
+import numpy as np
+import torch
+import random
+from .chemutils import get_clique_mol, tree_decomp, get_mol, get_smiles, set_atommap, get_clique_mol_simple
+from collections import defaultdict
+def get_slots(smiles):
+    """Parse `smiles` without sanitization and return one (symbol, formal charge, total H count)
+    tuple per atom, in atom order."""
+    mol = Chem.MolFromSmiles(smiles, sanitize=False)
+    slots = []
+    for atom in mol.GetAtoms():
+        slots.append((atom.GetSymbol(), atom.GetFormalCharge(), atom.GetTotalNumHs()))
+    return slots
+class Vocab(object):
+    """Bidirectional mapping between motif SMILES strings and integer indices."""
+    def __init__(self, smiles_list):
+        """:param smiles_list: sequence of SMILES strings; the position in the list is the index."""
+        self.vocab = smiles_list
+        self.vmap = {x: i for i, x in enumerate(self.vocab)}
+        # Slots are not precomputed here; get_slots() derives them on demand.
+    def get_index(self, smiles):
+        """Return the index of `smiles`, or 0 when it is not in the vocabulary."""
+        return self.vmap.get(smiles, 0)
+    def get_smiles(self, idx):
+        """Return the SMILES string stored at `idx`."""
+        return self.vocab[idx]
+    def get_slots(self, idx):
+        """Return a fresh list of (symbol, formal charge, total H count) tuples for entry `idx`.
+        A precomputed `slots` attribute is honoured when one has been set on the instance;
+        otherwise the slots are computed from the SMILES. This replaces an unconditional read
+        of self.slots, which __init__ never assigns.
+        """
+        precomputed = getattr(self, 'slots', None)
+        if precomputed is not None:
+            return copy.deepcopy(precomputed[idx])
+        # Inside a method the bare name resolves to the module-level get_slots function
+        return get_slots(self.vocab[idx])
+    def size(self):
+        """Return the number of vocabulary entries."""
+        return len(self.vocab)
+class MolTreeNode(object):
+    # One clique (motif) of a decomposed molecule, together with links to its neighboring nodes.
+    # NOTE(review): recover() reads self.nid and self.is_leaf, neither of which is assigned in this
+    # class; they have to be set on the node from outside before recover() is called.
+    def __init__(self, mol, cmol, clique):
+        # mol: the full molecule; cmol: the molecule of this clique; clique: atom indices into mol
+        self.smiles = Chem.MolToSmiles(cmol, canonical=True)
+        self.mol = cmol
+        self.clique = [x for x in clique]  # copy
+        self.neighbors = []
+        self.rotatable = False
+        # A two-atom clique is flagged rotatable when both of its atoms have degree >= 2 in mol
+        if len(self.clique) == 2:
+            if mol.GetAtomWithIdx(self.clique[0]).GetDegree() >= 2 and mol.GetAtomWithIdx(self.clique[1]).GetDegree() >= 2:
+                self.rotatable = True
+        # should restrict to single bond, but double bond is ok
+    def add_neighbor(self, nei_node):
+        # Append nei_node to this node's neighbor list (one direction only)
+        self.neighbors.append(nei_node)
+    def recover(self, original_mol):
+        # Build self.label / self.label_mol: the SMILES of this clique merged with its neighbors'
+        # cliques, with atoms tagged by node id through atom-map numbers. The atom-map numbers are
+        # written onto original_mol temporarily and reset to 0 before returning.
+        clique = []
+        clique.extend(self.clique)
+        if not self.is_leaf:
+            for cidx in self.clique:
+                original_mol.GetAtomWithIdx(cidx).SetAtomMapNum(self.nid)
+        for nei_node in self.neighbors:
+            clique.extend(nei_node.clique)
+            if nei_node.is_leaf:  # Leaf node, no need to mark
+                continue
+            for cidx in nei_node.clique:
+                # allow singleton node override the atom mapping
+                if cidx not in self.clique or len(nei_node.clique) == 1:
+                    atom = original_mol.GetAtomWithIdx(cidx)
+                    atom.SetAtomMapNum(nei_node.nid)
+        clique = list(set(clique))
+        label_mol = get_clique_mol_simple(original_mol, clique)
+        # Round-trip through SMILES so the stored label is the re-parsed, re-written form
+        self.label = Chem.MolToSmiles(Chem.MolFromSmiles(get_smiles(label_mol)))
+        self.label_mol = get_mol(self.label)
+        # Undo the temporary atom-map tags on every atom touched above
+        for cidx in clique:
+            original_mol.GetAtomWithIdx(cidx).SetAtomMapNum(0)
+        return self.label
+    def assemble(self):
+        # Enumerate candidate attachments of the neighbors (largest first) and store them in
+        # self.cands / self.cand_mols.
+        # NOTE(review): enum_assemble is not among the names imported from .chemutils in this
+        # file - confirm it is in scope before calling this method.
+        # neighbors = [nei for nei in self.neighbors if nei.mol.GetNumAtoms() > 1]
+        neighbors = sorted(self.neighbors, key=lambda x: x.mol.GetNumAtoms(), reverse=True)
+        # singletons = [nei for nei in self.neighbors if nei.mol.GetNumAtoms() == 1]
+        # neighbors = singletons + neighbors
+        cands = enum_assemble(self, neighbors)
+        if len(cands) > 0:
+            self.cands, self.cand_mols, _ = zip(*cands)
+            self.cands = list(self.cands)
+            self.cand_mols = list(self.cand_mols)
+        else:
+            self.cands = []
+            self.cand_mols = []
+class MolTree(object):
+    """Tree of MolTreeNode cliques obtained from tree_decomp() on an RDKit molecule."""
+    def __init__(self, mol):
+        """Decompose `mol`, link neighboring cliques, and number the nodes starting at 1.
+        The clique that contains atom index 0 is moved to position 0 of self.nodes.
+        """
+        self.smiles = Chem.MolToSmiles(mol)
+        self.mol = mol
+        # Vanilla tree decomposition is used for simplicity. A disabled variant loaded
+        # './utils/reference.npy' as a reference vocabulary to control the vocabulary size.
+        cliques, edges = tree_decomp(self.mol, reference_vocab=None)
+        self.nodes = []
+        root = 0
+        for i, c in enumerate(cliques):
+            cmol = get_clique_mol_simple(self.mol, c)
+            node = MolTreeNode(self.mol, cmol, c)
+            self.nodes.append(node)
+            if min(c) == 0:
+                root = i
+        self.num_rotatable_bond = sum(1 for node in self.nodes if node.rotatable)
+        for x, y in edges:
+            self.nodes[x].add_neighbor(self.nodes[y])
+            self.nodes[y].add_neighbor(self.nodes[x])
+        if root > 0:
+            self.nodes[0], self.nodes[root] = self.nodes[root], self.nodes[0]
+        for i, node in enumerate(self.nodes):
+            node.nid = i + 1
+            # MolTreeNode.recover() reads is_leaf on the node and on its neighbors; without this
+            # assignment recover() fails with AttributeError. (Marking non-leaf clique molecules
+            # with set_atommap stays disabled, as before.)
+            node.is_leaf = (len(node.neighbors) == 1)
+    def size(self):
+        """Return the number of nodes in the tree."""
+        return len(self.nodes)
+    def recover(self):
+        """Compute the label of every node against the full molecule."""
+        for node in self.nodes:
+            node.recover(self.mol)
+    def assemble(self):
+        """Enumerate attachment candidates for every node."""
+        for node in self.nodes:
+            node.assemble()
+if __name__ == "__main__":
+    # Build the motif vocabulary: decompose every ligand listed in the index, count how often each
+    # clique SMILES occurs, and write the counts to ./vocab.txt sorted by descending frequency.
+    import os  # needed for the path handling below; not imported at module level
+    seed = 2023
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    vocab = {}
+    cnt = 0
+    rot = 0
+    '''
+    index_path = './data/crossdocked_pocket10/index.pkl'
+    with open(index_path, 'rb') as f:
+        index = pickle.load(f)
+    for i, (pocket_fn, ligand_fn, _, rmsd_str) in enumerate(tqdm(index)):
+        if pocket_fn is None: continue
+        try:
+            path = './data/crossdocked_pocket10/' + ligand_fn
+            mol = Chem.MolFromMolFile(path, sanitize=False)
+            moltree = MolTree(mol)
+            cnt += 1
+            if moltree.num_rotatable_bond > 0:
+                rot += 1
+        except:
+            continue
+        for c in moltree.nodes:
+            smile_cluster = c.smiles
+            if smile_cluster not in vocab:
+                vocab[smile_cluster] = 1
+            else:
+                vocab[smile_cluster] += 1
+    '''
+    data_root = '/n/holyscratch01/mzitnik_lab/zaixizhang/pdbbind_pocket10/'
+    index = torch.load(os.path.join(data_root, 'index.pt'))
+    for pdbid in tqdm(index):
+        if pdbid is None:
+            continue
+        try:
+            # The loop variable is pdbid (the previous code referenced an undefined `item`);
+            # presumably each index entry is a PDB id string - verify against index.pt.
+            ligand_path = os.path.join(data_root, pdbid, pdbid + '_ligand.sdf')
+            mol = Chem.MolFromMolFile(ligand_path, sanitize=False)
+            moltree = MolTree(mol)
+        except Exception:
+            # Best effort: skip ligands that cannot be read or decomposed
+            continue
+        cnt += 1
+        if moltree.num_rotatable_bond > 0:
+            rot += 1
+        for c in moltree.nodes:
+            vocab[c.smiles] = vocab.get(c.smiles, 0) + 1
+    vocab = dict(sorted(vocab.items(), key=lambda kv: (kv[1], kv[0]), reverse=True))
+    with open('./vocab.txt', 'w') as vocab_file:
+        for k, v in vocab.items():
+            vocab_file.write(k + ':' + str(v))
+            vocab_file.write('\n')
+    # number of molecules and vocab
+    print('Size of the motif vocab:', len(vocab))
+    print('Total number of molecules', cnt)
+    # Avoid ZeroDivisionError when no molecule could be processed
+    print('Percent of molecules with rotatable bonds:', rot / cnt if cnt else 0.0)

utils/protein_ligand.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import sys
+sys.path.append("..")
+import os
+import numpy as np
+from rdkit import Chem
+from rdkit.Chem.rdchem import BondType
+from rdkit.Chem import ChemicalFeatures
+from rdkit import RDConfig
+from .mol_tree import *
+# Chemical-feature family names; their order defines the columns of the per-atom feature matrix
+ATOM_FAMILIES = ['Acceptor', 'Donor', 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe', 'NegIonizable', 'PosIonizable', 'ZnBinder']
+# Family name -> column index
+ATOM_FAMILIES_ID = {s: i for i, s in enumerate(ATOM_FAMILIES)}
+# RDKit BondType value -> integer index (enumeration order of BondType.names)
+BOND_TYPES = {t: i for i, t in enumerate(BondType.names.values())}
+# Integer index -> bond type name (same enumeration order as BOND_TYPES)
+BOND_NAMES = {i: t for i, t in enumerate(BondType.names.keys())}
+class PDBProtein(object):
+    """Parse the ATOM records of a PDB block into per-atom and per-residue arrays."""
+    # Three-letter residue name -> one-letter symbol
+    AA_NAME_SYM = {
+        'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F', 'GLY': 'G', 'HIS': 'H',
+        'ILE': 'I', 'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q',
+        'ARG': 'R', 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
+    }
+    # Three-letter residue name -> integer id (insertion order of AA_NAME_SYM)
+    AA_NAME_NUMBER = {
+        k: i for i, (k, _) in enumerate(AA_NAME_SYM.items())
+    }
+    BACKBONE_NAMES = ["CA", "C", "N", "O"]
+    def __init__(self, data, mode='auto'):
+        """
+        :param data: a path to a PDB file or the PDB text itself
+        :param mode: 'path' always reads `data` as a file; 'auto' does so only when `data` ends
+            with '.pdb' (case-insensitive); otherwise `data` is used as the PDB text
+        """
+        super().__init__()
+        if (data[-4:].lower() == '.pdb' and mode == 'auto') or mode == 'path':
+            with open(data, 'r') as f:
+                self.block = f.read()
+        else:
+            self.block = data
+        self.ptable = Chem.GetPeriodicTable()
+        # Molecule properties
+        self.title = None
+        # Atom properties
+        self.atoms = []
+        self.element = []
+        self.atomic_weight = []
+        self.pos = []
+        self.atom_name = []
+        self.is_backbone = []
+        self.atom_to_aa_type = []
+        # Residue properties
+        self.residues = []
+        self.amino_acid = []
+        self.center_of_mass = []
+        self.pos_CA = []
+        self.pos_C = []
+        self.pos_N = []
+        self.pos_O = []
+        self._parse()
+    def _enum_formatted_atom_lines(self):
+        """Yield one dict per ATOM or HEADER line of the block, stopping at the first ENDMDL.
+        ATOM fields are sliced from fixed column ranges of the line.
+        """
+        for line in self.block.splitlines():
+            if line[0:6].strip() == 'ATOM':
+                element_symb = line[76:78].strip().capitalize()
+                if len(element_symb) == 0:
+                    # No element column: fall back to a single character of the atom-name field
+                    element_symb = line[13:14]
+                yield {
+                    'line': line,
+                    'type': 'ATOM',
+                    'atom_id': int(line[6:11]),
+                    'atom_name': line[12:16].strip(),
+                    'res_name': line[17:20].strip(),
+                    'chain': line[21:22].strip(),
+                    'res_id': int(line[22:26]),
+                    'res_insert_id': line[26:27].strip(),
+                    'x': float(line[30:38]),
+                    'y': float(line[38:46]),
+                    'z': float(line[46:54]),
+                    'occupancy': float(line[54:60]),
+                    'segment': line[72:76].strip(),
+                    'element_symb': element_symb,
+                    'charge': line[78:80].strip(),
+                }
+            elif line[0:6].strip() == 'HEADER':
+                yield {
+                    'type': 'HEADER',
+                    'value': line[10:].strip()
+                }
+            elif line[0:6].strip() == 'ENDMDL':
+                break   # Some PDBs have more than 1 model.
+    def _parse(self):
+        """Fill the atom-level and residue-level lists from the parsed lines.
+        Raises KeyError for residue names that are not in AA_NAME_NUMBER.
+        """
+        # Process atoms
+        residues_tmp = {}
+        for atom in self._enum_formatted_atom_lines():
+            if atom['type'] == 'HEADER':
+                self.title = atom['value'].lower()
+                continue
+            self.atoms.append(atom)
+            atomic_number = self.ptable.GetAtomicNumber(atom['element_symb'])
+            next_ptr = len(self.element)  # index this atom will have in the atom-level lists
+            self.element.append(atomic_number)
+            self.atomic_weight.append(self.ptable.GetAtomicWeight(atomic_number))
+            self.pos.append(np.array([atom['x'], atom['y'], atom['z']], dtype=np.float32))
+            self.atom_name.append(atom['atom_name'])
+            self.is_backbone.append(atom['atom_name'] in self.BACKBONE_NAMES)
+            self.atom_to_aa_type.append(self.AA_NAME_NUMBER[atom['res_name']])
+            # Residues are keyed by chain, segment, residue number and insertion code
+            chain_res_id = '%s_%s_%d_%s' % (atom['chain'], atom['segment'], atom['res_id'], atom['res_insert_id'])
+            if chain_res_id not in residues_tmp:
+                residues_tmp[chain_res_id] = {
+                    'name': atom['res_name'],
+                    'atoms': [next_ptr],
+                    'chain': atom['chain'],
+                    'segment': atom['segment'],
+                }
+            else:
+                assert residues_tmp[chain_res_id]['name'] == atom['res_name']
+                assert residues_tmp[chain_res_id]['chain'] == atom['chain']
+                residues_tmp[chain_res_id]['atoms'].append(next_ptr)
+        # Process residues
+        self.residues = [r for _, r in residues_tmp.items()]
+        for residue in self.residues:
+            sum_pos = np.zeros([3], dtype=np.float32)
+            sum_mass = 0.0
+            for atom_idx in residue['atoms']:
+                sum_pos += self.pos[atom_idx] * self.atomic_weight[atom_idx]
+                sum_mass += self.atomic_weight[atom_idx]
+                if self.atom_name[atom_idx] in self.BACKBONE_NAMES:
+                    residue['pos_%s' % self.atom_name[atom_idx]] = self.pos[atom_idx]
+            residue['center_of_mass'] = sum_pos / sum_mass
+        # Process backbone atoms of residues
+        for residue in self.residues:
+            self.amino_acid.append(self.AA_NAME_NUMBER[residue['name']])
+            self.center_of_mass.append(residue['center_of_mass'])
+            for name in self.BACKBONE_NAMES:
+                pos_key = 'pos_%s' % name   # pos_CA, pos_C, pos_N, pos_O
+                if pos_key in residue:
+                    getattr(self, pos_key).append(residue[pos_key])
+                else:
+                    # Missing backbone atom: use the residue's center of mass in its place
+                    getattr(self, pos_key).append(residue['center_of_mass'])
+    def to_dict_atom(self):
+        """Return the atom-level data as a dict of numpy arrays (plus title and atom names)."""
+        return {
+            'element': np.array(self.element, dtype=np.int_),
+            'molecule_name': self.title,
+            'pos': np.array(self.pos, dtype=np.float32),
+            'is_backbone': np.array(self.is_backbone, dtype=bool),
+            'atom_name': self.atom_name,
+            'atom_to_aa_type': np.array(self.atom_to_aa_type, dtype=np.int_)
+        }
+    def to_dict_residue(self):
+        """Return the residue-level data as a dict of numpy arrays."""
+        return {
+            'amino_acid': np.array(self.amino_acid, dtype=np.int_),
+            'center_of_mass': np.array(self.center_of_mass, dtype=np.float32),
+            'pos_CA': np.array(self.pos_CA, dtype=np.float32),
+            'pos_C': np.array(self.pos_C, dtype=np.float32),
+            'pos_N': np.array(self.pos_N, dtype=np.float32),
+            'pos_O': np.array(self.pos_O, dtype=np.float32),
+        }
+    def query_residues_radius(self, center, radius, criterion='center_of_mass'):
+        """Return the residues whose `criterion` position lies strictly within `radius` of `center`."""
+        center = np.array(center).reshape(3)
+        selected = []
+        for residue in self.residues:
+            distance = np.linalg.norm(residue[criterion] - center, ord=2)
+            # (a leftover debug print of every residue/distance pair was removed here)
+            if distance < radius:
+                selected.append(residue)
+        return selected
+    def query_residues_ligand(self, ligand, radius, criterion='center_of_mass'):
+        """Return each residue (once) whose `criterion` position lies strictly within `radius`
+        of at least one position in ligand['pos'], in order of first match."""
+        selected = []
+        sel_idx = set()
+        # The time-complexity is O(mn).
+        for center in ligand['pos']:
+            for i, residue in enumerate(self.residues):
+                distance = np.linalg.norm(residue[criterion] - center, ord=2)
+                if distance < radius and i not in sel_idx:
+                    selected.append(residue)
+                    sel_idx.add(i)
+        return selected
+    def residues_to_pdb_block(self, residues, name='POCKET'):
+        """Return a PDB text made of HEADER/COMPND lines, the original ATOM lines of the given
+        residues, and a closing END line."""
+        block =  "HEADER    %s\n" % name
+        block += "COMPND    %s\n" % name
+        for residue in residues:
+            for atom_idx in residue['atoms']:
+                block += self.atoms[atom_idx]['line'] + "\n"
+        block += "END\n"
+        return block
+def parse_pdbbind_index_file(path):
+    """Return the first whitespace-separated token of every line that does not start with '#'."""
+    with open(path, 'r') as f:
+        lines = f.readlines()
+    return [entry.split()[0] for entry in lines if not entry.startswith('#')]
+def parse_sdf_file(path):
+    """Load a ligand file and return a dict with its atoms, bonds, features and motif tree.
+    The file is read three ways: as an RDKit molecule (motif tree and neighbor lists), through
+    an SDMolSupplier (chemical features), and as raw text (elements, coordinates and bonds taken
+    from the counts line, atom block and bond block).
+    Returned keys: element, pos, bond_index, bond_type, center_of_mass, atom_feature, moltree,
+    neighbors.
+    """
+    mol = Chem.MolFromMolFile(path, sanitize=True)
+    moltree = MolTree(mol)
+    fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
+    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
+    rdmol = next(iter(Chem.SDMolSupplier(path, removeHs=True)))
+    rd_num_atoms = rdmol.GetNumAtoms()
+    # One row per atom, one column per feature family; 1 marks membership
+    feat_mat = np.zeros([rd_num_atoms, len(ATOM_FAMILIES)], dtype=np.int_)
+    for feat in factory.GetFeaturesForMol(rdmol):
+        feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
+    with open(path, 'r') as f:
+        sdf = f.read()
+    sdf = sdf.splitlines()
+    # Fourth line is the counts line: atom and bond counts in two 3-character columns
+    num_atoms, num_bonds = map(int, [sdf[3][0:3], sdf[3][3:6]])
+    # The text-level atom count must match the molecule read with removeHs=True
+    assert num_atoms == rd_num_atoms
+    ptable = Chem.GetPeriodicTable()
+    element, pos = [], []
+    accum_pos = np.array([0.0, 0.0, 0.0], dtype=np.float32)
+    accum_mass = 0.0
+    # Atom block: whitespace-separated x, y, z and element symbol
+    for atom_line in map(lambda x:x.split(), sdf[4:4+num_atoms]):
+        x, y, z = map(float, atom_line[:3])
+        symb = atom_line[3]
+        atomic_number = ptable.GetAtomicNumber(symb.capitalize())
+        element.append(atomic_number)
+        pos.append([x, y, z])
+        atomic_weight = ptable.GetAtomicWeight(atomic_number)
+        accum_pos += np.array([x, y, z]) * atomic_weight
+        accum_mass += atomic_weight
+    center_of_mass = np.array(accum_pos / accum_mass, dtype=np.float32)
+    element = np.array(element, dtype=np.int_)
+    pos = np.array(pos, dtype=np.float32)
+    # Local copy of the bond-type enumeration (same expression as the module-level BOND_TYPES)
+    BOND_TYPES = {t: i for i, t in enumerate(BondType.names.values())}
+    # Bond-type code in the file -> index of the corresponding RDKit bond type
+    bond_type_map = {
+        1: BOND_TYPES[BondType.SINGLE],
+        2: BOND_TYPES[BondType.DOUBLE],
+        3: BOND_TYPES[BondType.TRIPLE],
+        4: BOND_TYPES[BondType.AROMATIC],
+    }
+    row, col, edge_type = [], [], []
+    # Bond block: three 3-character columns (begin atom, end atom, type); atom numbers are 1-based
+    for bond_line in sdf[4+num_atoms:4+num_atoms+num_bonds]:
+        start, end = int(bond_line[0:3])-1, int(bond_line[3:6])-1
+        # Store each bond in both directions
+        row += [start, end]
+        col += [end, start]
+        edge_type += 2 * [bond_type_map[int(bond_line[6:9])]]
+    edge_index = np.array([row, col], dtype=np.int_)
+    edge_type = np.array(edge_type, dtype=np.int_)
+    # Sort edges by (source, target)
+    perm = (edge_index[0] * num_atoms + edge_index[1]).argsort()
+    edge_index = edge_index[:, perm]
+    edge_type = edge_type[perm]
+    neighbor_dict = {}
+    #used in rotation angle prediction
+    for i, atom in enumerate(mol.GetAtoms()):
+        neighbor_dict[i] = [n.GetIdx() for n in atom.GetNeighbors()]
+    data = {
+        'element': element,
+        'pos': pos,
+        'bond_index': edge_index,
+        'bond_type': edge_type,
+        'center_of_mass': center_of_mass,
+        'atom_feature': feat_mat,
+        'moltree': moltree,
+        'neighbors': neighbor_dict
+    }
+    return data

utils/reconstruct.py ADDED Viewed

	@@ -0,0 +1,498 @@

+import numpy as np
+from rdkit.Chem import AllChem as Chem
+from rdkit import Geometry
+from openbabel import openbabel as ob
+from openbabel import pybel
+from scipy.spatial.distance import pdist
+from scipy.spatial.distance import squareform
+from .protein_ligand import ATOM_FAMILIES_ID
+class MolReconsError(Exception):
+    pass
+def reachable_r(a,b, seenbonds):
+    '''Recursive helper.'''
+    for nbr in ob.OBAtomAtomIter(a):
+        bond = a.GetBond(nbr).GetIdx()
+        if bond not in seenbonds:
+            seenbonds.add(bond)
+            if nbr == b:
+                return True
+            elif reachable_r(nbr,b,seenbonds):
+                return True
+    return False
+def reachable(a,b):
+    '''Return true if atom b is reachable from a without using the bond between them.'''
+    if a.GetExplicitDegree() == 1 or b.GetExplicitDegree() == 1:
+        return False #this is the _only_ bond for one atom
+    #otherwise do recursive traversal
+    seenbonds = set([a.GetBond(b).GetIdx()])
+    return reachable_r(a,b,seenbonds)
+def forms_small_angle(a,b,cutoff=45):
+    '''Return true if bond between a and b is part of a small angle
+    with a neighbor of a only.'''
+    for nbr in ob.OBAtomAtomIter(a):
+        if nbr != b:
+            degrees = b.GetAngle(a,nbr)
+            if degrees < cutoff:
+                return True
+    return False
+def make_obmol(xyz, atomic_numbers):
+    mol = ob.OBMol()
+    mol.BeginModify()
+    atoms = []
+    for xyz,t in zip(xyz, atomic_numbers):
+        x,y,z = xyz
+        # ch = struct.channels[t]
+        atom = mol.NewAtom()
+        atom.SetAtomicNum(t)
+        atom.SetVector(x,y,z)
+        atoms.append(atom)
+    return mol, atoms
+def connect_the_dots(mol, atoms, indicators, maxbond=4):
+    '''Custom implementation of ConnectTheDots.  This is similar to
+    OpenBabel's version, but is more willing to make long bonds
+    (up to maxbond long) to keep the molecule connected.  It also
+    attempts to respect atom type information from indicators.
+    atoms and indicators need to correspond in their order
+    Assumes no hydrogens or existing bonds.
+    mol is edited in place (bonds are added, then pruned) and nothing
+    is returned; with an empty atom list the function returns immediately.
+    '''
+    pt = Chem.GetPeriodicTable()
+    if len(atoms) == 0:
+        return
+    mol.BeginModify()
+    #just going to do n^2 comparisons, can worry about efficiency later
+    coords = np.array([(a.GetX(),a.GetY(),a.GetZ()) for a in atoms])
+    dists = squareform(pdist(coords))
+    # types = [struct.channels[t].name for t in struct.c]
+    for (i,a) in enumerate(atoms):
+        for (j,b) in enumerate(atoms):
+            # stop once the inner loop reaches a itself, so each pair is handled once
+            if a == b:
+                break
+            if dists[i,j] < 0.01:  #reduce from 0.4
+                continue #don't bond too close atoms
+            if dists[i,j] < maxbond:
+                flag = 0
+                # bond is flagged aromatic only when both atoms carry the 'Aromatic' indicator
+                if indicators[i][ATOM_FAMILIES_ID['Aromatic']] and indicators[j][ATOM_FAMILIES_ID['Aromatic']]:
+                    # print('Aromatic', ATOM_FAMILIES_ID['Aromatic'], indicators[i])
+                    flag = ob.OB_AROMATIC_BOND
+                # if 'Aromatic' in types[i] and 'Aromatic' in types[j]:
+                #     flag = ob.OB_AROMATIC_BOND
+                mol.AddBond(a.GetIdx(),b.GetIdx(),1,flag)
+    # per-atom bond budget, keyed by OpenBabel atom index
+    atom_maxb = {}
+    for (i,a) in enumerate(atoms):
+        #set max valence to the smallest max allowed by openbabel or rdkit
+        #since we want the molecule to be valid for both (rdkit is usually lower)
+        maxb = ob.GetMaxBonds(a.GetAtomicNum())
+        maxb = min(maxb,pt.GetDefaultValence(a.GetAtomicNum()))
+        if a.GetAtomicNum() == 16: # sulfone check
+            # sulfur with at least two oxygen neighbors is allowed six bonds
+            if count_nbrs_of_elem(a, 8) >= 2:
+                maxb = 6
+        # if indicators[i][ATOM_FAMILIES_ID['Donor']]:
+        #     maxb -= 1 #leave room for hydrogen
+        # if 'Donor' in types[i]:
+        #     maxb -= 1 #leave room for hydrogen
+        atom_maxb[a.GetIdx()] = maxb
+    #remove any impossible bonds between halogens
+    # NOTE(review): bonds are deleted while OBMolBondIter is still iterating the
+    # molecule - confirm the iterator tolerates this, or snapshot the bonds first.
+    for bond in ob.OBMolBondIter(mol):
+        a1 = bond.GetBeginAtom()
+        a2 = bond.GetEndAtom()
+        if atom_maxb[a1.GetIdx()] == 1 and atom_maxb[a2.GetIdx()] == 1:
+            mol.DeleteBond(bond)
+    def get_bond_info(biter):
+        '''Return bonds sorted by their distortion.
+        Each entry is (stretch, length, bond) where stretch is the bond
+        length minus the sum of the two covalent radii; most stretched first.'''
+        bonds = [b for b in biter]
+        binfo = []
+        for bond in bonds:
+            bdist = bond.GetLength()
+            #compute how far away from optimal we are
+            a1 = bond.GetBeginAtom()
+            a2 = bond.GetEndAtom()
+            ideal = ob.GetCovalentRad(a1.GetAtomicNum()) + ob.GetCovalentRad(a2.GetAtomicNum())
+            stretch = bdist-ideal
+            binfo.append((stretch,bdist,bond))
+        binfo.sort(reverse=True, key=lambda t: t[:2]) #most stretched bonds first
+        return binfo
+    #prioritize removing hypervalency causing bonds, do more valent
+    #constrained atoms first since their bonds introduce the most problems
+    #with reachability (e.g. oxygen)
+    # (this hypervalency pass is currently disabled)
+    # hypers = sorted([(atom_maxb[a.GetIdx()],a.GetExplicitValence() - atom_maxb[a.GetIdx()], a) for a in atoms],key=lambda aa: (aa[0],-aa[1]))
+    # for mb,diff,a in hypers:
+    #     if a.GetExplicitValence() <= atom_maxb[a.GetIdx()]:
+    #         continue
+    #     binfo = get_bond_info(ob.OBAtomBondIter(a))
+    #     for stretch,bdist,bond in binfo:
+    #         #can we remove this bond without disconnecting the molecule?
+    #         a1 = bond.GetBeginAtom()
+    #         a2 = bond.GetEndAtom()
+    #         #get right valence
+    #         if a1.GetExplicitValence() > atom_maxb[a1.GetIdx()] or \
+    #             a2.GetExplicitValence() > atom_maxb[a2.GetIdx()]:
+    #             #don't fragment the molecule
+    #             if not reachable(a1,a2):
+    #                 continue
+    #             mol.DeleteBond(bond)
+    #             if a.GetExplicitValence() <= atom_maxb[a.GetIdx()]:
+    #                 break #let nbr atoms choose what bonds to throw out
+    binfo = get_bond_info(ob.OBMolBondIter(mol))
+    #now eliminate geometrically poor bonds
+    for stretch,bdist,bond in binfo:
+        #can we remove this bond without disconnecting the molecule?
+        a1 = bond.GetBeginAtom()
+        a2 = bond.GetEndAtom()
+        #as long as we aren't disconnecting, let's remove things
+        #that are excessively far away (0.45 from ConnectTheDots)
+        #get bonds to be less than max allowed
+        #also remove tight angles, because that is what ConnectTheDots does
+        if stretch > 0.45 or forms_small_angle(a1,a2) or forms_small_angle(a2,a1):
+            #don't fragment the molecule
+            if not reachable(a1,a2):
+                continue
+            mol.DeleteBond(bond)
+    mol.EndModify()
+def convert_ob_mol_to_rd_mol(ob_mol,struct=None):
+    '''Convert OBMol to RDKit mol, fixing up issues'''
+    ob_mol.DeleteHydrogens()
+    n_atoms = ob_mol.NumAtoms()
+    rd_mol = Chem.RWMol()
+    rd_conf = Chem.Conformer(n_atoms)
+    for ob_atom in ob.OBMolAtomIter(ob_mol):
+        rd_atom = Chem.Atom(ob_atom.GetAtomicNum())
+        #TODO copy format charge
+        if ob_atom.IsAromatic() and ob_atom.IsInRing() and ob_atom.MemberOfRingSize() <= 6:
+            #don't commit to being aromatic unless rdkit will be okay with the ring status
+            #(this can happen if the atoms aren't fit well enough)
+            rd_atom.SetIsAromatic(True)
+        i = rd_mol.AddAtom(rd_atom)
+        ob_coords = ob_atom.GetVector()
+        x = ob_coords.GetX()
+        y = ob_coords.GetY()
+        z = ob_coords.GetZ()
+        rd_coords = Geometry.Point3D(x, y, z)
+        rd_conf.SetAtomPosition(i, rd_coords)
+    rd_mol.AddConformer(rd_conf)
+    for ob_bond in ob.OBMolBondIter(ob_mol):
+        i = ob_bond.GetBeginAtomIdx()-1
+        j = ob_bond.GetEndAtomIdx()-1
+        bond_order = ob_bond.GetBondOrder()
+        if bond_order == 1:
+            rd_mol.AddBond(i, j, Chem.BondType.SINGLE)
+        elif bond_order == 2:
+            rd_mol.AddBond(i, j, Chem.BondType.DOUBLE)
+        elif bond_order == 3:
+            rd_mol.AddBond(i, j, Chem.BondType.TRIPLE)
+        else:
+            raise Exception('unknown bond order {}'.format(bond_order))
+        if ob_bond.IsAromatic():
+            bond = rd_mol.GetBondBetweenAtoms (i,j)
+            bond.SetIsAromatic(True)
+    rd_mol = Chem.RemoveHs(rd_mol, sanitize=False)
+    pt = Chem.GetPeriodicTable()
+    #if double/triple bonds are connected to hypervalent atoms, decrement the order
+    positions = rd_mol.GetConformer().GetPositions()
+    nonsingles = []
+    for bond in rd_mol.GetBonds():
+        if bond.GetBondType() == Chem.BondType.DOUBLE or bond.GetBondType() == Chem.BondType.TRIPLE:
+            i = bond.GetBeginAtomIdx()
+            j = bond.GetEndAtomIdx()
+            dist = np.linalg.norm(positions[i]-positions[j])
+            nonsingles.append((dist,bond))
+    nonsingles.sort(reverse=True, key=lambda t: t[0])
+    for (d,bond) in nonsingles:
+        a1 = bond.GetBeginAtom()
+        a2 = bond.GetEndAtom()
+        if calc_valence(a1) > pt.GetDefaultValence(a1.GetAtomicNum()) or \
+           calc_valence(a2) > pt.GetDefaultValence(a2.GetAtomicNum()):
+            btype = Chem.BondType.SINGLE
+            if bond.GetBondType() == Chem.BondType.TRIPLE:
+                btype = Chem.BondType.DOUBLE
+            bond.SetBondType(btype)
+    for atom in rd_mol.GetAtoms():
+        #set nitrogens with 4 neighbors to have a charge
+        if atom.GetAtomicNum() == 7 and atom.GetDegree() == 4:
+            atom.SetFormalCharge(1)
+    rd_mol = Chem.AddHs(rd_mol,addCoords=True)
+    positions = rd_mol.GetConformer().GetPositions()
+    center = np.mean(positions[np.all(np.isfinite(positions),axis=1)],axis=0)
+    for atom in rd_mol.GetAtoms():
+        i = atom.GetIdx()
+        pos = positions[i]
+        if not np.all(np.isfinite(pos)):
+            #hydrogens on C fragment get set to nan (shouldn't, but they do)
+            rd_mol.GetConformer().SetAtomPosition(i,center)
+    try:
+        Chem.SanitizeMol(rd_mol,Chem.SANITIZE_ALL^Chem.SANITIZE_KEKULIZE)
+    except:
+        raise MolReconsError()
+    # try:
+    #     Chem.SanitizeMol(rd_mol,Chem.SANITIZE_ALL^Chem.SANITIZE_KEKULIZE)
+    # except: # mtr22 - don't assume mols will pass this
+    #     pass
+    #     # dkoes - but we want to make failures as rare as possible and should debug them
+    #     m = pybel.Molecule(ob_mol)
+    #     i = np.random.randint(1000000)
+    #     outname = 'bad%d.sdf'%i
+    #     print("WRITING",outname)
+    #     m.write('sdf',outname,overwrite=True)
+    #     pickle.dump(struct,open('bad%d.pkl'%i,'wb'))
+    #but at some point stop trying to enforce our aromaticity -
+    #openbabel and rdkit have different aromaticity models so they
+    #won't always agree.  Remove any aromatic bonds to non-aromatic atoms
+    for bond in rd_mol.GetBonds():
+        a1 = bond.GetBeginAtom()
+        a2 = bond.GetEndAtom()
+        if bond.GetIsAromatic():
+            if not a1.GetIsAromatic() or not a2.GetIsAromatic():
+                bond.SetIsAromatic(False)
+        elif a1.GetIsAromatic() and a2.GetIsAromatic():
+            bond.SetIsAromatic(True)
+    return rd_mol
+def calc_valence(rdatom):
+    '''Can call GetExplicitValence before sanitize, but need to
+    know this to fix up the molecule to prevent sanitization failures'''
+    cnt = 0.0
+    for bond in rdatom.GetBonds():
+        cnt += bond.GetBondTypeAsDouble()
+    return cnt
+def count_nbrs_of_elem(atom, atomic_num):
+    '''
+    Count the number of neighbors atoms
+    of atom with the given atomic_num.
+    '''
+    count = 0
+    for nbr in ob.OBAtomAtomIter(atom):
+        if nbr.GetAtomicNum() == atomic_num:
+            count += 1
+    return count
+def fixup(atoms, mol, indicators):
+    '''Set atom properties to match channel.  Keep doing this
+    to beat openbabel over the head with what we want to happen.'''
+    mol.SetAromaticPerceived(True)  #avoid perception
+    for i, atom in enumerate(atoms):
+        # ch = struct.channels[t]
+        ind = indicators[i]
+        if ind[ATOM_FAMILIES_ID['Aromatic']]:
+            atom.SetAromatic(True)
+            atom.SetHyb(2)
+        # if ind[ATOM_FAMILIES_ID['Donor']]:
+        #     if atom.GetExplicitDegree() == atom.GetHvyDegree():
+        #         if atom.GetHvyDegree() == 1 and atom.GetAtomicNum() == 7:
+        #             atom.SetImplicitHCount(2)
+        #         else:
+        #             atom.SetImplicitHCount(1)
+        # elif ind[ATOM_FAMILIES_ID['Acceptor']]: # NOT AcceptorDonor because of else
+        #     atom.SetImplicitHCount(0)
+        if (atom.GetAtomicNum() in (7, 8)) and atom.IsInRing():     # Nitrogen, Oxygen
+            #this is a little iffy, ommitting until there is more evidence it is a net positive
+            #we don't have aromatic types for nitrogen, but if it
+            #is in a ring with aromatic carbon mark it aromatic as well
+            acnt = 0
+            for nbr in ob.OBAtomAtomIter(atom):
+                if nbr.IsAromatic():
+                    acnt += 1
+            if acnt > 1:
+                atom.SetAromatic(True)
+def raw_obmol_from_generated(data):
+    xyz = data.ligand_context_pos.clone().cpu().tolist()
+    atomic_nums = data.ligand_context_element.clone().cpu().tolist()
+    # indicators = data.ligand_context_feature_full[:, -len(ATOM_FAMILIES_ID):].clone().cpu().bool().tolist()
+    mol, atoms = make_obmol(xyz, atomic_nums)
+    return mol, atoms
+UPGRADE_BOND_ORDER = {Chem.BondType.SINGLE:Chem.BondType.DOUBLE, Chem.BondType.DOUBLE:Chem.BondType.TRIPLE}
+def postprocess_rd_mol_1(rdmol):
+    rdmol = Chem.RemoveHs(rdmol)
+    # Construct bond nbh list
+    nbh_list = {}
+    for bond in rdmol.GetBonds():
+        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+        if begin not in nbh_list: nbh_list[begin] = [end]
+        else: nbh_list[begin].append(end)
+        if end not in nbh_list: nbh_list[end] = [begin]
+        else: nbh_list[end].append(begin)
+    # Fix missing bond-order
+    for atom in rdmol.GetAtoms():
+        idx = atom.GetIdx()
+        num_radical = atom.GetNumRadicalElectrons()
+        if num_radical > 0:
+            for j in nbh_list[idx]:
+                if j <= idx: continue
+                nb_atom = rdmol.GetAtomWithIdx(j)
+                nb_radical = nb_atom.GetNumRadicalElectrons()
+                if nb_radical > 0:
+                    bond = rdmol.GetBondBetweenAtoms(idx, j)
+                    bond.SetBondType(UPGRADE_BOND_ORDER[bond.GetBondType()])
+                    nb_atom.SetNumRadicalElectrons(nb_radical - 1)
+                    num_radical -= 1
+            atom.SetNumRadicalElectrons(num_radical)
+        num_radical = atom.GetNumRadicalElectrons()
+        if num_radical > 0:
+            atom.SetNumRadicalElectrons(0)
+            num_hs = atom.GetNumExplicitHs()
+            atom.SetNumExplicitHs(num_hs + num_radical)
+    return rdmol
+def postprocess_rd_mol_2(rdmol):
+    rdmol_edit = Chem.RWMol(rdmol)
+    ring_info = rdmol.GetRingInfo()
+    ring_info.AtomRings()
+    rings = [set(r) for r in ring_info.AtomRings()]
+    for i, ring_a in enumerate(rings):
+        if len(ring_a) == 3:
+            non_carbon = []
+            atom_by_symb = {}
+            for atom_idx in ring_a:
+                symb = rdmol.GetAtomWithIdx(atom_idx).GetSymbol()
+                if symb != 'C':
+                    non_carbon.append(atom_idx)
+                if symb not in atom_by_symb:
+                    atom_by_symb[symb] = [atom_idx]
+                else:
+                    atom_by_symb[symb].append(atom_idx)
+            if len(non_carbon) == 2:
+                rdmol_edit.RemoveBond(*non_carbon)
+            if 'O' in atom_by_symb and len(atom_by_symb['O']) == 2:
+                rdmol_edit.RemoveBond(*atom_by_symb['O'])
+                rdmol_edit.GetAtomWithIdx(atom_by_symb['O'][0]).SetNumExplicitHs(
+                    rdmol_edit.GetAtomWithIdx(atom_by_symb['O'][0]).GetNumExplicitHs() + 1
+                )
+                rdmol_edit.GetAtomWithIdx(atom_by_symb['O'][1]).SetNumExplicitHs(
+                    rdmol_edit.GetAtomWithIdx(atom_by_symb['O'][1]).GetNumExplicitHs() + 1
+                )
+    rdmol = rdmol_edit.GetMol()
+    for atom in rdmol.GetAtoms():
+        if atom.GetFormalCharge() > 0:
+            atom.SetFormalCharge(0)
+    return rdmol
+def reconstruct_from_generated(data):
+    '''Rebuild an RDKit molecule from generated ligand atoms.
+    Reads ligand_context_pos, ligand_context_element and the last
+    len(ATOM_FAMILIES_ID) columns of ligand_context_feature_full from
+    data, infers bonds with connect_the_dots (maxbond=2), lets OpenBabel
+    add hydrogens and perceive bond orders, converts the result to RDKit
+    and applies both post-processing passes.
+    Returns the post-processed RDKit mol; convert_ob_mol_to_rd_mol may
+    raise MolReconsError.
+    NOTE(review): the data attributes are presumably torch tensors
+    (clone/cpu/tolist are called on them) - confirm against the caller.'''
+    xyz = data.ligand_context_pos.clone().cpu().tolist()
+    atomic_nums = data.ligand_context_element.clone().cpu().tolist()
+    # per-atom boolean family indicators, indexed with ATOM_FAMILIES_ID
+    indicators = data.ligand_context_feature_full[:, -len(ATOM_FAMILIES_ID):].clone().cpu().bool().tolist()
+    mol, atoms = make_obmol(xyz, atomic_nums)
+    # fixup() is re-applied after every OpenBabel step below to re-assert
+    # the aromatic flags taken from the indicators
+    fixup(atoms, mol, indicators)
+    connect_the_dots(mol, atoms, indicators, 2)
+    fixup(atoms, mol, indicators)
+    mol.EndModify()
+    fixup(atoms, mol, indicators)
+    mol.AddPolarHydrogens()
+    mol.PerceiveBondOrders()
+    fixup(atoms, mol, indicators)
+    for (i,a) in enumerate(atoms):
+        ob.OBAtomAssignTypicalImplicitHydrogens(a)
+    fixup(atoms, mol, indicators)
+    mol.AddHydrogens()
+    fixup(atoms, mol, indicators)
+    #make rings all aromatic if majority of carbons are aromatic
+    for ring in ob.OBMolRingIter(mol):
+        if 5 <= ring.Size() <= 6:
+            carbon_cnt = 0
+            aromatic_ccnt = 0
+            for ai in ring._path:
+                a = mol.GetAtom(ai)
+                if a.GetAtomicNum() == 6:
+                    carbon_cnt += 1
+                    if a.IsAromatic():
+                        aromatic_ccnt += 1
+            # at least half of the ring carbons aromatic, but not already the whole ring
+            if aromatic_ccnt >= carbon_cnt/2 and aromatic_ccnt != ring.Size():
+                #set all ring atoms to be aromatic
+                for ai in ring._path:
+                    a = mol.GetAtom(ai)
+                    a.SetAromatic(True)
+    #bonds must be marked aromatic for smiles to match
+    for bond in ob.OBMolBondIter(mol):
+        a1 = bond.GetBeginAtom()
+        a2 = bond.GetEndAtom()
+        if a1.IsAromatic() and a2.IsAromatic():
+            bond.SetAromatic(True)
+    # bond orders are perceived a second time after the aromatic flags changed
+    mol.PerceiveBondOrders()
+    rd_mol = convert_ob_mol_to_rd_mol(mol)
+    # Post-processing
+    rd_mol = postprocess_rd_mol_1(rd_mol)
+    rd_mol = postprocess_rd_mol_2(rd_mol)
+    return rd_mol

utils/sascorer.py ADDED Viewed

	@@ -0,0 +1,163 @@

+from __future__ import print_function
+from rdkit import Chem
+from rdkit.Chem import rdMolDescriptors
+from rdkit.six.moves import cPickle
+from rdkit.six import iteritems
+import math
+from collections import defaultdict
+import os.path as op
+_fscores = None
+def readFragmentScores(name='fpscores'):
+  import gzip
+  global _fscores
+  # generate the full path filename:
+  if name == "fpscores":
+    name = op.join(op.dirname(__file__), name)
+  _fscores = cPickle.load(gzip.open('%s.pkl.gz' % name))
+  outDict = {}
+  for i in _fscores:
+    for j in range(1, len(i)):
+      outDict[i[j]] = float(i[0])
+  _fscores = outDict
+def numBridgeheadsAndSpiro(mol, ri=None):
+  nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
+  nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
+  return nBridgehead, nSpiro
+def calculateScore(m):
+  if _fscores is None:
+    readFragmentScores()
+  # fragment score
+  fp = rdMolDescriptors.GetMorganFingerprint(m,
+                                             2)  #<- 2 is the *radius* of the circular fingerprint
+  fps = fp.GetNonzeroElements()
+  score1 = 0.
+  nf = 0
+  for bitId, v in iteritems(fps):
+    nf += v
+    sfp = bitId
+    score1 += _fscores.get(sfp, -4) * v
+  score1 /= nf
+  # features score
+  nAtoms = m.GetNumAtoms()
+  nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
+  ri = m.GetRingInfo()
+  nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
+  nMacrocycles = 0
+  for x in ri.AtomRings():
+    if len(x) > 8:
+      nMacrocycles += 1
+  sizePenalty = nAtoms**1.005 - nAtoms
+  stereoPenalty = math.log10(nChiralCenters + 1)
+  spiroPenalty = math.log10(nSpiro + 1)
+  bridgePenalty = math.log10(nBridgeheads + 1)
+  macrocyclePenalty = 0.
+  # ---------------------------------------
+  # This differs from the paper, which defines:
+  #  macrocyclePenalty = math.log10(nMacrocycles+1)
+  # This form generates better results when 2 or more macrocycles are present
+  if nMacrocycles > 0:
+    macrocyclePenalty = math.log10(2)
+  score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
+  # correction for the fingerprint density
+  # not in the original publication, added in version 1.1
+  # to make highly symmetrical molecules easier to synthetise
+  score3 = 0.
+  if nAtoms > len(fps):
+    score3 = math.log(float(nAtoms) / len(fps)) * .5
+  sascore = score1 + score2 + score3
+  # need to transform "raw" value into scale between 1 and 10
+  min = -4.0
+  max = 2.5
+  sascore = 11. - (sascore - min + 1) / (max - min) * 9.
+  # smooth the 10-end
+  if sascore > 8.:
+    sascore = 8. + math.log(sascore + 1. - 9.)
+  if sascore > 10.:
+    sascore = 10.0
+  elif sascore < 1.:
+    sascore = 1.0
+  return sascore
+def processMols(mols):
+  print('smiles\tName\tsa_score')
+  for i, m in enumerate(mols):
+    if m is None:
+      continue
+    s = calculateScore(m)
+    smiles = Chem.MolToSmiles(m)
+    print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)
+if __name__ == '__main__':
+  import sys, time
+  t1 = time.time()
+  readFragmentScores("fpscores")
+  t2 = time.time()
+  suppl = Chem.SmilesMolSupplier(sys.argv[1])
+  t3 = time.time()
+  processMols(suppl)
+  t4 = time.time()
+  print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
+        file=sys.stderr)
+#
+#  Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
+#  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+#       copyright notice, this list of conditions and the following
+#       disclaimer in the documentation and/or other materials provided
+#       with the distribution.
+#     * Neither the name of Novartis Institutes for BioMedical Research Inc.
+#       nor the names of its contributors may be used to endorse or promote
+#       products derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+def compute_sa_score(rdmol):
+    rdmol = Chem.MolFromSmiles(Chem.MolToSmiles(rdmol))
+    sa = calculateScore(rdmol)
+    sa = round((10-sa)/9,2)
+    return sa

utils/similarity.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import numpy as np
+from rdkit import Chem, DataStructs
+def tanimoto_sim(mol, ref):
+    fp1 = Chem.RDKFingerprint(ref)
+    fp2 = Chem.RDKFingerprint(mol)
+    return DataStructs.TanimotoSimilarity(fp1,fp2)
+def tanimoto_sim_N_to_1(mols, ref):
+    sim = [tanimoto_sim(m, ref) for m in mols]
+    return sim
+def batched_number_of_rings(mols):
+    n = []
+    for m in mols:
+        n.append(Chem.rdMolDescriptors.CalcNumRings(m))
+    return np.array(n)

utils/train.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import copy
+import warnings
+import numpy as np
+import torch
+import torch.nn as nn
+from torch_geometric.data import Data, Batch
+from .warmup import GradualWarmupScheduler
+#customize exp lr scheduler with min lr
+class ExponentialLR_with_minLr(torch.optim.lr_scheduler.ExponentialLR):
+    def __init__(self, optimizer, gamma, min_lr=1e-4, last_epoch=-1, verbose=False):
+        self.gamma = gamma
+        self.min_lr = min_lr
+        super(ExponentialLR_with_minLr, self).__init__(optimizer, gamma, last_epoch, verbose)
+    def get_lr(self):
+        if not self._get_lr_called_within_step:
+            warnings.warn("To get the last learning rate computed by the scheduler, "
+                          "please use `get_last_lr()`.", UserWarning)
+        if self.last_epoch == 0:
+            return self.base_lrs
+        return [max(group['lr'] * self.gamma, self.min_lr)
+                for group in self.optimizer.param_groups]
+    def _get_closed_form_lr(self):
+        return [max(base_lr * self.gamma ** self.last_epoch, self.min_lr)
+                for base_lr in self.base_lrs]
+def repeat_data(data: Data, num_repeat) -> Batch:
+    datas = [copy.deepcopy(data) for i in range(num_repeat)]
+    return Batch.from_data_list(datas)
+def repeat_batch(batch: Batch, num_repeat) -> Batch:
+    datas = batch.to_data_list()
+    new_data = []
+    for i in range(num_repeat):
+        new_data += copy.deepcopy(datas)
+    return Batch.from_data_list(new_data)
+def inf_iterator(iterable):
+    iterator = iterable.__iter__()
+    while True:
+        try:
+            yield iterator.__next__()
+        except StopIteration:
+            iterator = iterable.__iter__()
+def get_optimizer(cfg, model):
+    if cfg.type == 'adam':
+        return torch.optim.Adam(
+            model.parameters(),
+            lr=cfg.lr,
+            weight_decay=cfg.weight_decay,
+            betas=(cfg.beta1, cfg.beta2, )
+        )
+    else:
+        raise NotImplementedError('Optimizer not supported: %s' % cfg.type)
+def get_scheduler(cfg, optimizer):
+    if cfg.type == 'plateau':
+        return torch.optim.lr_scheduler.ReduceLROnPlateau(
+            optimizer,
+            factor=cfg.factor,
+            patience=cfg.patience,
+            min_lr=cfg.min_lr
+        )
+    elif cfg.type == 'warmup_plateau':
+        return GradualWarmupScheduler(
+            optimizer,
+            multiplier = cfg.multiplier,
+            total_epoch = cfg.total_epoch,
+            after_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+                optimizer,
+                factor=cfg.factor,
+                patience=cfg.patience,
+                min_lr=cfg.min_lr
+            )
+        )
+    elif cfg.type == 'expmin':
+        return ExponentialLR_with_minLr(
+            optimizer,
+            gamma=cfg.factor,
+            min_lr=cfg.min_lr,
+        )
+    elif cfg.type == 'expmin_milestone':
+        gamma = np.exp(np.log(cfg.factor) / cfg.milestone)
+        return ExponentialLR_with_minLr(
+            optimizer,
+            gamma=gamma,
+            min_lr=cfg.min_lr,
+        )
+    else:
+        raise NotImplementedError('Scheduler not supported: %s' % cfg.type)

utils/transforms.py ADDED Viewed

	@@ -0,0 +1,684 @@

+import sys
+sys.path.append("..")
+import copy
+import os
+import random
+import torch
+import torch.nn.functional as F
+import numpy as np
+from copy import deepcopy
+from torch_geometric.transforms import Compose
+from torch_geometric.nn.pool import knn_graph
+from torch_geometric.utils.subgraph import subgraph
+from torch_geometric.utils.num_nodes import maybe_num_nodes
+from torch_geometric.data import Data, Batch
+from torch_scatter import scatter_add
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+from rdkit.Chem import AllChem
+from .data import ProteinLigandData
+from .protein_ligand import ATOM_FAMILIES
+from .chemutils import enumerate_assemble, list_filter, rand_rotate
+from .dihedral_utils import batch_dihedrals
+# allowable node and edge features
+allowable_features = {
+    'possible_atomic_num_list': list(range(1, 119)),
+    'possible_formal_charge_list': [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5],
+    'possible_chirality_list': [
+        Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
+        Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
+        Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
+        Chem.rdchem.ChiralType.CHI_OTHER
+    ],
+    'possible_hybridization_list': [
+        Chem.rdchem.HybridizationType.S,
+        Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2,
+        Chem.rdchem.HybridizationType.SP3, Chem.rdchem.HybridizationType.SP3D,
+        Chem.rdchem.HybridizationType.SP3D2, Chem.rdchem.HybridizationType.UNSPECIFIED
+    ],
+    'possible_numH_list': [0, 1, 2, 3, 4, 5, 6, 7, 8],
+    'possible_implicit_valence_list': [0, 1, 2, 3, 4, 5, 6],
+    'possible_degree_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+    'possible_bonds': [
+        Chem.rdchem.BondType.SINGLE,
+        Chem.rdchem.BondType.DOUBLE,
+        Chem.rdchem.BondType.TRIPLE,
+        Chem.rdchem.BondType.AROMATIC
+    ],
+    'possible_bond_dirs': [  # only for double bond stereo information
+        Chem.rdchem.BondDir.NONE,
+        Chem.rdchem.BondDir.ENDUPRIGHT,
+        Chem.rdchem.BondDir.ENDDOWNRIGHT
+    ]
+}
+def mol_to_graph_data_obj_simple(mol):
+    """
+    Converts rdkit mol object to graph Data object required by the pytorch
+    geometric package. NB: Uses simplified atom and bond features, and represent
+    as indices
+    :param mol: rdkit mol object
+    :return: graph data object with the attributes: x, edge_index, edge_attr
+    """
+    # atoms
+    num_atom_features = 2  # atom type,  chirality tag
+    atom_features_list = []
+    for atom in mol.GetAtoms():
+        atom_feature = [allowable_features['possible_atomic_num_list'].index(
+            atom.GetAtomicNum())] + [allowable_features[
+                                         'possible_chirality_list'].index(atom.GetChiralTag())]
+        atom_features_list.append(atom_feature)
+    x = torch.tensor(np.array(atom_features_list), dtype=torch.long)
+    # bonds
+    num_bond_features = 2  # bond type, bond direction
+    if len(mol.GetBonds()) > 0:  # mol has bonds
+        edges_list = []
+        edge_features_list = []
+        for bond in mol.GetBonds():
+            i = bond.GetBeginAtomIdx()
+            j = bond.GetEndAtomIdx()
+            edge_feature = [allowable_features['possible_bonds'].index(
+                bond.GetBondType())] + [allowable_features[
+                'possible_bond_dirs'].index(
+                bond.GetBondDir())]
+            edges_list.append((i, j))
+            edge_features_list.append(edge_feature)
+            edges_list.append((j, i))
+            edge_features_list.append(edge_feature)
+        # data.edge_index: Graph connectivity in COO format with shape [2, num_edges]
+        edge_index = torch.tensor(np.array(edges_list).T, dtype=torch.long)
+        # data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]
+        edge_attr = torch.tensor(np.array(edge_features_list),
+                                 dtype=torch.long)
+    else:  # mol has no bonds
+        edge_index = torch.empty((2, 0), dtype=torch.long)
+        edge_attr = torch.empty((0, num_bond_features), dtype=torch.long)
+    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
+    return data
+class RefineData(object):
+    """Transform that removes hydrogen atoms (element == 1) from pocket and ligand.
+
+    The per-atom protein and ligand fields are filtered, and the ligand
+    neighbour lists and bond indices are re-indexed so that they refer to the
+    surviving atoms only.
+    """
+    def __init__(self):
+        super().__init__()
+    def __call__(self, data):
+        """Strip hydrogens from `data` in place and return the same object."""
+        # delete H atom of pocket
+        protein_element = data.protein_element
+        is_H_protein = (protein_element == 1)
+        if torch.sum(is_H_protein) > 0:
+            not_H_protein = ~is_H_protein
+            # protein_atom_name is filtered with `compress` (not tensor indexing),
+            # the other protein fields are indexed with the boolean mask.
+            data.protein_atom_name = list(compress(data.protein_atom_name, not_H_protein))
+            data.protein_atom_to_aa_type = data.protein_atom_to_aa_type[not_H_protein]
+            data.protein_element = data.protein_element[not_H_protein]
+            data.protein_is_backbone = data.protein_is_backbone[not_H_protein]
+            data.protein_pos = data.protein_pos[not_H_protein]
+        # delete H atom of ligand
+        ligand_element = data.ligand_element
+        is_H_ligand = (ligand_element == 1)
+        if torch.sum(is_H_ligand) > 0:
+            not_H_ligand = ~is_H_ligand
+            data.ligand_atom_feature = data.ligand_atom_feature[not_H_ligand]
+            data.ligand_element = data.ligand_element[not_H_ligand]
+            data.ligand_pos = data.ligand_pos[not_H_ligand]
+            # nbh
+            index_atom_H = torch.nonzero(is_H_ligand)[:, 0]
+            # index_changer maps an old atom index to its index after H removal;
+            # removed (hydrogen) atoms keep the placeholder value -1.
+            index_changer = -np.ones(len(not_H_ligand), dtype=np.int64)
+            index_changer[not_H_ligand] = np.arange(torch.sum(not_H_ligand))
+            # Keep the neighbour lists of heavy atoms only.
+            # NOTE(review): assumes ligand_nbh_list.values() iterates in atom-index order - confirm.
+            new_nbh_list = [value for ind_this, value in zip(not_H_ligand, data.ligand_nbh_list.values()) if ind_this]
+            # Drop hydrogen neighbours and translate the remaining ones to new indices.
+            data.ligand_nbh_list = {i: [index_changer[node] for node in neigh if node not in index_atom_H] for i, neigh
+                                    in enumerate(new_nbh_list)}
+            # bond
+            # A bond is discarded as soon as either endpoint is a hydrogen.
+            ind_bond_with_H = np.array([(bond_i in index_atom_H) | (bond_j in index_atom_H) for bond_i, bond_j in
+                                        zip(*data.ligand_bond_index)])
+            ind_bond_without_H = ~ind_bond_with_H
+            old_ligand_bond_index = data.ligand_bond_index[:, ind_bond_without_H]
+            # Re-index the surviving bond endpoints through index_changer.
+            data.ligand_bond_index = torch.tensor(index_changer)[old_ligand_bond_index]
+            data.ligand_bond_type = data.ligand_bond_type[ind_bond_without_H]
+        return data
+class FocalBuilder(object):
+    """Transform that derives focal-atom and to-be-generated-atom indices.
+
+    With a non-empty ligand context, the focal atoms are the context atoms that
+    are bonded to a masked atom. With an empty context, the focal atoms are
+    protein atoms lying close to the masked ligand atoms.
+
+    `close_threshold` and `max_bond_length` are stored on the instance but are
+    only referenced by the commented-out negative-sampling code at the end of
+    `__call__`.
+    """
+    def __init__(self, close_threshold=0.8, max_bond_length=2.4):
+        self.close_threshold = close_threshold
+        self.max_bond_length = max_bond_length
+        super().__init__()
+    def __call__(self, data: ProteinLigandData):
+        """Populate focal / generation index fields on `data` and return it.
+
+        Fields written in both branches: `idx_generated_in_ligand_masked`,
+        `pos_generate`, `idx_focal_in_compose`, `idx_protein_all_mask`,
+        `y_protein_frontier`.
+        """
+        # ligand_context_pos = data.ligand_context_pos
+        # ligand_pos = data.ligand_pos
+        ligand_masked_pos = data.ligand_masked_pos
+        protein_pos = data.protein_pos
+        context_idx = data.context_idx
+        masked_idx = data.masked_idx
+        old_bond_index = data.ligand_bond_index
+        # old_bond_types = data.ligand_bond_type  # type: 0, 1, 2
+        has_unmask_atoms = context_idx.nelement() > 0
+        if has_unmask_atoms:
+            # # get bridge bond index (mask-context bond)
+            # Row 0 of the bond index is tested against masked_idx and row 1
+            # against context_idx, so only masked -> context directed edges match.
+            ind_edge_index_candidate = [
+                (context_node in context_idx) and (mask_node in masked_idx)
+                for mask_node, context_node in zip(*old_bond_index)
+            ]  # the mask-context order is right
+            bridge_bond_index = old_bond_index[:, ind_edge_index_candidate]
+            # candidate_bond_types = old_bond_types[idx_edge_index_candidate]
+            idx_generated_in_whole_ligand = bridge_bond_index[0]
+            idx_focal_in_whole_ligand = bridge_bond_index[1]
+            # Lookup table: whole-ligand atom index -> position inside masked_idx.
+            # NOTE(review): masked_idx.max() would fail on an empty masked_idx - confirm callers always mask >= 1 atom.
+            index_changer_masked = torch.zeros(masked_idx.max() + 1, dtype=torch.int64)
+            index_changer_masked[masked_idx] = torch.arange(len(masked_idx))
+            idx_generated_in_ligand_masked = index_changer_masked[idx_generated_in_whole_ligand]
+            pos_generate = ligand_masked_pos[idx_generated_in_ligand_masked]
+            data.idx_generated_in_ligand_masked = idx_generated_in_ligand_masked
+            data.pos_generate = pos_generate
+            # Lookup table: whole-ligand atom index -> position inside context_idx.
+            index_changer_context = torch.zeros(context_idx.max() + 1, dtype=torch.int64)
+            index_changer_context[context_idx] = torch.arange(len(context_idx))
+            idx_focal_in_ligand_context = index_changer_context[idx_focal_in_whole_ligand]
+            idx_focal_in_compose = idx_focal_in_ligand_context  # if ligand_context was not before protein in the compose, this was not correct
+            data.idx_focal_in_compose = idx_focal_in_compose
+            data.idx_protein_all_mask = torch.empty(0, dtype=torch.long)  # no use if has context
+            data.y_protein_frontier = torch.empty(0, dtype=torch.bool)  # no use if has context
+        else:  # # the initial atom. surface atoms between ligand and protein
+            # Pair protein atoms with masked ligand atoms within r=4; below,
+            # row 0 is used as protein indices and row 1 as ligand indices.
+            assign_index = radius(x=ligand_masked_pos, y=protein_pos, r=4., num_workers=16)
+            if assign_index.size(1) == 0:
+                # No pair within the radius: fall back to a single closest
+                # (protein, ligand) pair from the full distance matrix.
+                dist = torch.norm(data.protein_pos.unsqueeze(1) - data.ligand_masked_pos.unsqueeze(0), p=2, dim=-1)
+                assign_index = torch.nonzero(dist <= torch.min(dist) + 1e-5)[0:1].transpose(0, 1)
+            idx_focal_in_protein = assign_index[0]
+            data.idx_focal_in_compose = idx_focal_in_protein  # no ligand context, so all composes are protein atoms
+            data.pos_generate = ligand_masked_pos[assign_index[1]]
+            data.idx_generated_in_ligand_masked = torch.unique(assign_index[1])  # for real of the contractive transform
+            data.idx_protein_all_mask = data.idx_protein_in_compose  # for input of initial frontier prediction
+            y_protein_frontier = torch.zeros_like(data.idx_protein_all_mask,
+                                                  dtype=torch.bool)  # for label of initial frontier prediction
+            # Flag every protein atom that was selected as a focal atom.
+            y_protein_frontier[torch.unique(idx_focal_in_protein)] = True
+            data.y_protein_frontier = y_protein_frontier
+        # Disabled negative-position sampling, kept for reference:
+        # generate not positions: around pos_focal ( with `max_bond_length` distance) but not close to true generated within `close_threshold`
+        # pos_focal = ligand_context_pos[idx_focal_in_ligand_context]
+        # pos_notgenerate = pos_focal + torch.randn_like(pos_focal) * self.max_bond_length  / 2.4
+        # dist = torch.norm(pos_generate - pos_notgenerate, p=2, dim=-1)
+        # ind_close = (dist < self.close_threshold)
+        # while ind_close.any():
+        #     new_pos_notgenerate = pos_focal[ind_close] + torch.randn_like(pos_focal[ind_close]) * self.max_bond_length  / 2.3
+        #     dist[ind_close] = torch.norm(pos_generate[ind_close] - new_pos_notgenerate, p=2, dim=-1)
+        #     pos_notgenerate[ind_close] = new_pos_notgenerate
+        #     ind_close = (dist < self.close_threshold)
+        # data.pos_notgenerate = pos_notgenerate
+        return data
+class AtomComposer(object):
+    """Transform that merges ligand-context atoms and protein atoms into one "compose" graph.
+
+    Ligand-context atoms come first in the composed tensors, protein atoms
+    follow. A kNN graph is then built over the composed positions and edges
+    that coincide with ligand-context bonds are tagged with their bond type.
+    """
+    def __init__(self, protein_dim, ligand_dim, knn):
+        super().__init__()
+        self.protein_dim = protein_dim
+        self.ligand_dim = ligand_dim
+        self.knn = knn  # knn of compose atoms
+    def __call__(self, data: ProteinLigandData):
+        """Write the `compose_*` and `idx_*_in_compose` fields on `data` and return it."""
+        # fetch ligand context and protein from data
+        ligand_context_pos = data['ligand_context_pos']
+        ligand_context_feature_full = data['ligand_context_feature_full']
+        protein_pos = data['protein_pos']
+        protein_atom_feature = data['protein_atom_feature']
+        len_ligand_ctx = len(ligand_context_pos)
+        len_protein = len(protein_pos)
+        # compose ligand context and protein. save idx of them in compose
+        data['compose_pos'] = torch.cat([ligand_context_pos, protein_pos], dim=0)
+        len_compose = len_ligand_ctx + len_protein
+        # Right-pad the ligand features with (protein_dim - ligand_dim) zero
+        # columns so they can be stacked with the protein features below.
+        ligand_context_feature_full_expand = torch.cat([
+            ligand_context_feature_full,
+            torch.zeros([len_ligand_ctx, self.protein_dim - self.ligand_dim], dtype=torch.long)
+        ], dim=1)
+        data['compose_feature'] = torch.cat([ligand_context_feature_full_expand, protein_atom_feature], dim=0)
+        data['idx_ligand_ctx_in_compose'] = torch.arange(len_ligand_ctx, dtype=torch.long)  # can be delete
+        data['idx_protein_in_compose'] = torch.arange(len_protein, dtype=torch.long) + len_ligand_ctx  # can be delete
+        # build knn graph and bond type
+        data = self.get_knn_graph(data, self.knn, len_ligand_ctx, len_compose, num_workers=16)
+        return data
+    @staticmethod
+    def get_knn_graph(data: ProteinLigandData, knn, len_ligand_ctx, len_compose, num_workers=1, ):
+        """Build the kNN graph over `compose_pos` and label edges that are ligand bonds.
+
+        Writes `compose_knn_edge_index`, `compose_knn_edge_type` and
+        `compose_knn_edge_feature` on `data` and returns it.
+        """
+        data['compose_knn_edge_index'] = knn_graph(data['compose_pos'], knn, flow='target_to_source', num_workers=num_workers)
+        # Encode each (row0, row1) edge as a single integer row0 * len_compose + row1
+        # so kNN edges and ligand bonds can be compared with `==`.
+        # Only the first len_ligand_ctx * knn edges are considered.
+        # NOTE(review): presumably those are exactly the edges of the ligand-context nodes
+        # (edges grouped per node, knn edges each) - confirm against knn_graph's output order.
+        id_compose_edge = data['compose_knn_edge_index'][0,
+                          :len_ligand_ctx * knn] * len_compose + data['compose_knn_edge_index'][1, :len_ligand_ctx * knn]
+        # NOTE(review): reads data['ligand_context_bond_index'] / ['ligand_context_bond_type'];
+        # confirm which upstream transform populates 'ligand_context_bond_type'.
+        id_ligand_ctx_edge = data['ligand_context_bond_index'][0] * len_compose + data['ligand_context_bond_index'][1]
+        # For every ligand bond find the position of the matching kNN edge; -1 when there is none.
+        idx_edge = [torch.nonzero(id_compose_edge == id_) for id_ in id_ligand_ctx_edge]
+        idx_edge = torch.tensor([a.squeeze() if len(a) > 0 else torch.tensor(-1) for a in idx_edge], dtype=torch.long)
+        data['compose_knn_edge_type'] = torch.zeros(len(data['compose_knn_edge_index'][0]),
+                                                 dtype=torch.long)  # for encoder edge embedding
+        # Edges that match a ligand bond get that bond's type; all others stay 0.
+        data['compose_knn_edge_type'][idx_edge[idx_edge >= 0]] = data['ligand_context_bond_type'][idx_edge >= 0]
+        # Default edge feature is [1, 0, 0, 0]; matched edges are overwritten
+        # with the 4-class one-hot of their bond type.
+        data['compose_knn_edge_feature'] = torch.cat([
+            torch.ones([len(data['compose_knn_edge_index'][0]), 1], dtype=torch.long),
+            torch.zeros([len(data['compose_knn_edge_index'][0]), 3], dtype=torch.long),
+        ], dim=-1)
+        data['compose_knn_edge_feature'][idx_edge[idx_edge >= 0]] = F.one_hot(data['ligand_context_bond_type'][idx_edge >= 0],
+                                                                           num_classes=4)  # 0 (1,2,3)-onehot
+        return data
+class FeaturizeProteinAtom(object):
+    def __init__(self):
+        super().__init__()
+        # self.atomic_numbers = torch.LongTensor([1, 6, 7, 8, 16, 34])    # H, C, N, O, S, Se
+        self.atomic_numbers = torch.LongTensor([6, 7, 8, 16, 34])  # H, C, N, O, S, Se
+        self.max_num_aa = 20
+    @property
+    def feature_dim(self):
+        return self.atomic_numbers.size(0) + self.max_num_aa + 1
+    def __call__(self, data: ProteinLigandData):
+        element = data['protein_element'].view(-1, 1) == self.atomic_numbers.view(1, -1)  # (N_atoms, N_elements)
+        amino_acid = F.one_hot(data['protein_atom_to_aa_type'], num_classes=self.max_num_aa)
+        is_backbone = data['protein_is_backbone'].view(-1, 1).long()
+        x = torch.cat([element, amino_acid, is_backbone], dim=-1)
+        data['protein_atom_feature'] = x
+        return data
+class FeaturizeLigandAtom(object):
+    def __init__(self):
+        super().__init__()
+        # self.atomic_numbers = torch.LongTensor([1,6,7,8,9,15,16,17])  # H C N O F P S Cl
+        self.atomic_numbers = torch.LongTensor([6, 7, 8, 9, 15, 16, 17])  # C N O F P S Cl
+    @property
+    def num_properties(self):
+        return len(ATOM_FAMILIES)
+    @property
+    def feature_dim(self):
+        return self.atomic_numbers.size(0) + len(ATOM_FAMILIES)
+    def __call__(self, data: ProteinLigandData):
+        element = data['ligand_element'].view(-1, 1) == self.atomic_numbers.view(1, -1)  # (N_atoms, N_elements)
+        x = torch.cat([element, data['ligand_atom_feature']], dim=-1)
+        data['ligand_atom_feature_full'] = x
+        return data
+class FeaturizeLigandBond(object):
+    def __init__(self):
+        super().__init__()
+    def __call__(self, data: ProteinLigandData):
+        data['ligand_bond_feature'] = F.one_hot((data['ligand_bond_type'] - 1)%3, num_classes=3)  # (1,2,3) to (0,1,2)-onehot
+        neighbor_dict = {}
+        # used in rotation angle prediction
+        mol = data['moltree'].mol
+        for i, atom in enumerate(mol.GetAtoms()):
+            neighbor_dict[i] = [n.GetIdx() for n in atom.GetNeighbors()]
+        data['ligand_neighbors'] = neighbor_dict
+        return data
+class LigandCountNeighbors(object):
+    @staticmethod
+    def count_neighbors(edge_index, symmetry, valence=None, num_nodes=None):
+        assert symmetry == True, 'Only support symmetrical edges.'
+        if num_nodes is None:
+            num_nodes = maybe_num_nodes(edge_index)
+        if valence is None:
+            valence = torch.ones([edge_index.size(1)], device=edge_index.device)
+        valence = valence.view(edge_index.size(1))
+        return scatter_add(valence, index=edge_index[0], dim=0, dim_size=num_nodes).long()
+    def __init__(self):
+        super().__init__()
+    def __call__(self, data):
+        data['ligand_num_neighbors'] = self.count_neighbors(
+            data['ligand_bond_index'],
+            symmetry=True,
+            num_nodes=data['ligand_element'].size(0),
+        )
+        data['ligand_atom_valence'] = self.count_neighbors(
+            data['ligand_bond_index'],
+            symmetry=True,
+            valence=data['ligand_bond_type'],
+            num_nodes=data['ligand_element'].size(0),
+        )
+        return data
+class LigandRandomMask(object):
+    def __init__(self, min_ratio=0.0, max_ratio=1.2, min_num_masked=1, min_num_unmasked=0):
+        super().__init__()
+        self.min_ratio = min_ratio
+        self.max_ratio = max_ratio
+        self.min_num_masked = min_num_masked
+        self.min_num_unmasked = min_num_unmasked
+    def __call__(self, data: ProteinLigandData):
+        ratio = np.clip(random.uniform(self.min_ratio, self.max_ratio), 0.0, 1.0)
+        num_atoms = data.ligand_element.size(0)
+        num_masked = int(num_atoms * ratio)
+        if num_masked < self.min_num_masked:
+            num_masked = self.min_num_masked
+        if (num_atoms - num_masked) < self.min_num_unmasked:
+            num_masked = num_atoms - self.min_num_unmasked
+        idx = np.arange(num_atoms)
+        np.random.shuffle(idx)
+        idx = torch.LongTensor(idx)
+        masked_idx = idx[:num_masked]
+        context_idx = idx[num_masked:]
+        data.ligand_masked_element = data.ligand_element[masked_idx]
+        data.ligand_masked_feature = data.ligand_atom_feature[masked_idx]  # For Prediction
+        data.ligand_masked_pos = data.ligand_pos[masked_idx]
+        data.ligand_context_element = data.ligand_element[context_idx]
+        data.ligand_context_feature_full = data.ligand_atom_feature_full[context_idx]  # For Input
+        data.ligand_context_pos = data.ligand_pos[context_idx]
+        data.ligand_context_bond_index, data.ligand_context_bond_feature = subgraph(
+            context_idx,
+            data.ligand_bond_index,
+            edge_attr=data.ligand_bond_feature,
+            relabel_nodes=True,
+        )
+        data.ligand_context_num_neighbors = LigandCountNeighbors.count_neighbors(
+            data.ligand_context_bond_index,
+            symmetry=True,
+            num_nodes=context_idx.size(0),
+        )
+        # print(context_idx)
+        # print(data.ligand_context_bond_index)
+        # mask = torch.logical_and(
+        #     (data.ligand_bond_index[0].view(-1, 1) == context_idx.view(1, -1)).any(dim=-1),
+        #     (data.ligand_bond_index[1].view(-1, 1) == context_idx.view(1, -1)).any(dim=-1),
+        # )
+        # print(data.ligand_bond_index[:, mask])
+        # print(data.ligand_context_num_neighbors)
+        # print(data.ligand_num_neighbors[context_idx])
+        data.ligand_frontier = data.ligand_context_num_neighbors < data.ligand_num_neighbors[context_idx]
+        data._mask = 'random'
+        return data
+class LigandBFSMask(object):
+    """Motif-level masking that follows a randomised BFS order over the motif tree.
+
+    The first motifs of the BFS order form the ligand context, the remaining
+    ones are masked. Besides the context tensors, `__call__` also prepares the
+    labels for next-motif prediction, distance-matrix prediction, attachment
+    (assemble) prediction and torsion-angle prediction.
+    """
+    def __init__(self, min_ratio=0.0, max_ratio=1.2, min_num_masked=1, min_num_unmasked=0, vocab=None):
+        super().__init__()
+        self.min_ratio = min_ratio
+        self.max_ratio = max_ratio
+        self.min_num_masked = min_num_masked
+        self.min_num_unmasked = min_num_unmasked
+        self.vocab = vocab
+        # NOTE(review): vocab defaults to None but .size() is called unconditionally,
+        # so a vocab is effectively required.
+        self.vocab_size = vocab.size()
+    @staticmethod
+    def get_bfs_perm_motif(moltree, vocab):
+        """Return (bfs_perm, bfs_focal) for the motif tree, starting from node 0.
+
+        As a side effect every node gets `nid` (its position in moltree.nodes)
+        and `wid` (vocab index of its SMILES). `bfs_perm` is the visiting order
+        of node ids; `bfs_focal` gets one entry (the node being expanded) per
+        newly discovered node, so bfs_focal[k] is the BFS parent of bfs_perm[k + 1].
+        """
+        for i, node in enumerate(moltree.nodes):
+            node.nid = i
+            node.wid = vocab.get_index(node.smiles)
+        # num_motifs = len(moltree.nodes)
+        bfs_queue = [0]
+        bfs_perm = []
+        bfs_focal = []
+        visited = {bfs_queue[0]}
+        while len(bfs_queue) > 0:
+            current = bfs_queue.pop(0)
+            bfs_perm.append(current)
+            next_candid = []
+            for motif in moltree.nodes[current].neighbors:
+                if motif.nid in visited: continue
+                next_candid.append(motif.nid)
+                visited.add(motif.nid)
+                bfs_focal.append(current)
+            # Siblings are enqueued in random order; they all share the same parent,
+            # so the bfs_focal / bfs_perm pairing is unaffected by the shuffle.
+            random.shuffle(next_candid)
+            bfs_queue += next_candid
+        return bfs_perm, bfs_focal
+    def __call__(self, data):
+        """Mask the trailing motifs of a BFS order and write all training targets on `data`."""
+        bfs_perm, bfs_focal = self.get_bfs_perm_motif(data['moltree'], self.vocab)
+        ratio = np.clip(random.uniform(self.min_ratio, self.max_ratio), 0.0, 1.0)
+        num_motifs = len(bfs_perm)
+        num_masked = int(num_motifs * ratio)
+        if num_masked < self.min_num_masked:
+            num_masked = self.min_num_masked
+        if (num_motifs - num_masked) < self.min_num_unmasked:
+            num_masked = num_motifs - self.min_num_unmasked
+        num_unmasked = num_motifs - num_masked
+        # NOTE(review): if num_masked could ever be 0, bfs_perm[:-0] is empty (not the full
+        # list) and the [-num_masked] lookups below would hit index 0 - confirm min_num_masked >= 1.
+        context_motif_ids = bfs_perm[:-num_masked]
+        # Context atoms = union of the atom cliques of all context motifs.
+        context_idx = set()
+        for i in context_motif_ids:
+            context_idx = context_idx | set(data['moltree'].nodes[i].clique)
+        context_idx = torch.LongTensor(list(context_idx))
+        if num_masked == num_motifs:
+            # Nothing generated yet: the "current motif" id is vocab_size (one past the
+            # vocabulary) and the current atoms come from data['protein_contact_idx'].
+            data['current_wid'] = torch.tensor([self.vocab_size])
+            data['current_atoms'] = torch.tensor([data['protein_contact_idx']])
+            data['next_wid'] = torch.tensor([data['moltree'].nodes[bfs_perm[-num_masked]].wid])
+        else:
+            # bfs_perm[-num_masked] is the first masked motif, bfs_focal[-num_masked] its BFS parent.
+            data['current_wid'] = torch.tensor([data['moltree'].nodes[bfs_focal[-num_masked]].wid])
+            data['next_wid'] = torch.tensor([data['moltree'].nodes[bfs_perm[-num_masked]].wid])  # For Prediction
+            current_atoms = data['moltree'].nodes[bfs_focal[-num_masked]].clique
+            # Positions of the focal motif's atoms inside context_idx, offset by the protein atom count.
+            data['current_atoms'] = torch.cat([torch.where(context_idx == i)[0] for i in current_atoms]) + len(data['protein_pos'])
+        data['ligand_context_element'] = data['ligand_element'][context_idx]
+        data['ligand_context_feature_full'] = data['ligand_atom_feature_full'][context_idx]  # For Input
+        data['ligand_context_pos'] = data['ligand_pos'][context_idx]
+        data['ligand_center'] = torch.mean(data['ligand_pos'], dim=0)
+        data['num_atoms'] = torch.tensor([len(context_idx) + len(data['protein_pos'])])
+        # distance matrix prediction
+        if len(data['ligand_context_pos']) > 0:
+            # Two random atoms of the root motif and the 4 protein atoms closest to the first of them.
+            sample_idx = random.sample(data['moltree'].nodes[bfs_perm[0]].clique, 2)
+            data['dm_ligand_idx'] = torch.cat([torch.where(context_idx == i)[0] for i in sample_idx])
+            data['dm_protein_idx'] = torch.sort(torch.norm(data['protein_pos'] - data['ligand_context_pos'][data['dm_ligand_idx'][0]], dim=-1)).indices[:4]
+            data['true_dm'] = torch.norm(data['protein_pos'][data['dm_protein_idx']].unsqueeze(1) - data['ligand_context_pos'][data['dm_ligand_idx']].unsqueeze(0), dim=-1).reshape(-1)
+        else:
+            data['true_dm'] = torch.tensor([])
+        data['protein_alpha_carbon_index'] = torch.tensor([i for i, name in enumerate(data['protein_atom_name']) if name =="CA"])
+        data['alpha_carbon_indicator'] = torch.tensor([True if name =="CA" else False for name in data['protein_atom_name']])
+        # assemble prediction
+        data['protein_contact'] = torch.tensor(data['protein_contact'])
+        if len(context_motif_ids) > 0:
+            # enumerate_assemble is defined elsewhere; it is given the molecule, the context atoms,
+            # the focal motif and the next motif, and returns candidate labels and molecules.
+            cand_labels, cand_mols = enumerate_assemble(data['moltree'].mol, context_idx.tolist(),
+                                                        data['moltree'].nodes[bfs_focal[-num_masked]],
+                                                        data['moltree'].nodes[bfs_perm[-num_masked]])
+            data['cand_labels'] = cand_labels
+            data['cand_mols'] = [mol_to_graph_data_obj_simple(mol) for mol in cand_mols]
+        else:
+            data['cand_labels'], data['cand_mols'] = torch.tensor([]), []
+        # Bonds restricted to the context atoms, relabelled to context-local indices.
+        data['ligand_context_bond_index'], data['ligand_context_bond_feature'] = subgraph(
+            context_idx,
+            data['ligand_bond_index'],
+            edge_attr=data['ligand_bond_feature'],
+            relabel_nodes=True,
+        )
+        data['ligand_context_num_neighbors'] = LigandCountNeighbors.count_neighbors(
+            data['ligand_context_bond_index'],
+            symmetry=True,
+            num_nodes=context_idx.size(0),
+        )
+        # Frontier atoms have fewer neighbours inside the context than in the full ligand.
+        data['ligand_frontier'] = data['ligand_context_num_neighbors'] < data['ligand_num_neighbors'][context_idx]
+        data['_mask'] = 'bfs'
+        # find a rotatable bond as the current motif
+        # (`id` shadows the builtin inside this loop.)
+        rotatable_ids = []
+        for i, id in enumerate(bfs_focal):
+            if data['moltree'].nodes[id].rotatable:
+                rotatable_ids.append(i)
+        if len(rotatable_ids) == 0:
+            # assign empty tensor
+            data['ligand_torsion_xy_index'] = torch.tensor([])
+            data['dihedral_mask'] = torch.tensor([]).bool()
+            data['ligand_element_torsion'] = torch.tensor([])
+            data['ligand_pos_torsion'] = torch.tensor([])
+            data['ligand_feature_torsion'] = torch.tensor([])
+            data['true_sin'], data['true_cos'], data['true_three_hop'] = torch.tensor([]), torch.tensor([]), torch.tensor([])
+            data['xn_pos'], data['yn_pos'], data['y_pos'] = torch.tensor([]), torch.tensor([]), torch.tensor([])
+        else:
+            # num_unmasked is re-bound here: from now on it is a randomly chosen position k in
+            # bfs_focal whose motif is rotatable (independent of the masking split above).
+            num_unmasked = random.sample(rotatable_ids, 1)[0]
+            # Current motif = bfs_focal[k]; next motif = its BFS child bfs_perm[k + 1].
+            current_idx = torch.LongTensor(data['moltree'].nodes[bfs_focal[num_unmasked]].clique)
+            next_idx = torch.LongTensor(data['moltree'].nodes[bfs_perm[num_unmasked + 1]].clique)
+            current_idx_set = set(data['moltree'].nodes[bfs_focal[num_unmasked]].clique)
+            next_idx_set = set(data['moltree'].nodes[bfs_perm[num_unmasked + 1]].clique)
+            # Atoms of every motif up to and including the next motif.
+            all_idx = set()
+            for i in bfs_perm[:num_unmasked + 2]:
+                all_idx = all_idx | set(data['moltree'].nodes[i].clique)
+            all_idx = list(all_idx)
+            # x is an atom shared by the current and next motif, y another atom of the current motif.
+            # NOTE(review): presumably a rotatable motif is a two-atom bond, making y unique - confirm.
+            x_id = current_idx_set.intersection(next_idx_set).pop()
+            y_id = (current_idx_set - {x_id}).pop()
+            data['ligand_torsion_xy_index'] = torch.cat([torch.where(torch.LongTensor(all_idx) == i)[0] for i in [x_id, y_id]])
+            x_pos, y_pos = deepcopy(data['ligand_pos'][x_id]), deepcopy(data['ligand_pos'][y_id])
+            # remove x, y, and non-generated elements
+            xn, yn = deepcopy(data['ligand_neighbors'][x_id]), deepcopy(data['ligand_neighbors'][y_id])
+            xn.remove(y_id)
+            yn.remove(x_id)
+            # At most three neighbours per side are kept (before filtering).
+            xn, yn = xn[:3], yn[:3]
+            # debug
+            # list_filter is defined elsewhere; presumably it keeps only entries present in all_idx - confirm.
+            xn, yn = list_filter(xn, all_idx), list_filter(yn, all_idx)
+            # Zero-padded (3, 3) neighbour position buffers.
+            xn_pos, yn_pos = torch.zeros(3, 3), torch.zeros(3, 3)
+            xn_pos[:len(xn)], yn_pos[:len(yn)] = deepcopy(data['ligand_pos'][xn]), deepcopy(data['ligand_pos'][yn])
+            # All 9 (x-neighbour slot, y-neighbour slot) index pairs.
+            xn_idx, yn_idx = torch.cartesian_prod(torch.arange(3), torch.arange(3)).chunk(2, dim=-1)
+            xn_idx = xn_idx.squeeze(-1)
+            yn_idx = yn_idx.squeeze(-1)
+            dihedral_x, dihedral_y = torch.zeros(3), torch.zeros(3)
+            dihedral_x[:len(xn)] = 1
+            dihedral_y[:len(yn)] = 1
+            # Outer product of the validity flags: True only where both slots hold a real neighbour.
+            data['dihedral_mask'] = torch.matmul(dihedral_x.view(3, 1), dihedral_y.view(1, 3)).view(-1).bool()
+            data['true_sin'], data['true_cos'] = batch_dihedrals(xn_pos[xn_idx], x_pos.repeat(9, 1), y_pos.repeat(9, 1),
+                                                           yn_pos[yn_idx])
+            data['true_three_hop'] = torch.linalg.norm(xn_pos[xn_idx] - yn_pos[yn_idx], dim=-1)[data['dihedral_mask']]
+            # random rotate to simulate the inference situation
+            # (`dir` shadows the builtin; it is the vector between the first two atoms of the
+            # current motif. rand_rotate is defined elsewhere.)
+            # The next motif's coordinates in data['ligand_pos'] are overwritten in place.
+            dir = data['ligand_pos'][current_idx[0]] - data['ligand_pos'][current_idx[1]]
+            ref = deepcopy(data['ligand_pos'][current_idx[0]])
+            next_motif_pos = deepcopy(data['ligand_pos'][next_idx])
+            data['ligand_pos'][next_idx] = rand_rotate(dir, ref, next_motif_pos)
+            data['ligand_element_torsion'] = data['ligand_element'][all_idx]
+            data['ligand_pos_torsion'] = data['ligand_pos'][all_idx]
+            data['ligand_feature_torsion'] = data['ligand_atom_feature_full'][all_idx]
+            # Positions below are taken after the random rotation and expressed relative to x.
+            x_pos = deepcopy(data['ligand_pos'][x_id])
+            data['y_pos'] = data['ligand_pos'][y_id] - x_pos
+            data['xn_pos'], data['yn_pos'] = torch.zeros(3, 3), torch.zeros(3, 3)
+            data['xn_pos'][:len(xn)], data['yn_pos'][:len(yn)] = data['ligand_pos'][xn] - x_pos, data['ligand_pos'][yn] - x_pos
+        return data
+class LigandMaskAll(LigandBFSMask):
+    def __init__(self, vocab):
+        super().__init__(min_ratio=1.0, vocab=vocab)
+class LigandMixedMask(object):
+    def __init__(self, min_ratio=0.0, max_ratio=1.2, min_num_masked=1, min_num_unmasked=0, p_random=0.5, p_bfs=0.25,
+                 p_invbfs=0.25):
+        super().__init__()
+        self.t = [
+            LigandRandomMask(min_ratio, max_ratio, min_num_masked, min_num_unmasked),
+            LigandBFSMask(min_ratio, max_ratio, min_num_masked, min_num_unmasked, inverse=False),
+            LigandBFSMask(min_ratio, max_ratio, min_num_masked, min_num_unmasked, inverse=True),
+        ]
+        self.p = [p_random, p_bfs, p_invbfs]
+    def __call__(self, data):
+        f = random.choices(self.t, k=1, weights=self.p)[0]
+        return f(data)
+def get_mask(cfg, vocab):
+    if cfg.type == 'bfs':
+        return LigandBFSMask(
+            min_ratio=cfg.min_ratio,
+            max_ratio=cfg.max_ratio,
+            min_num_masked=cfg.min_num_masked,
+            min_num_unmasked=cfg.min_num_unmasked,
+            vocab=vocab
+        )
+    elif cfg.type == 'random':
+        return LigandRandomMask(
+            min_ratio=cfg.min_ratio,
+            max_ratio=cfg.max_ratio,
+            min_num_masked=cfg.min_num_masked,
+            min_num_unmasked=cfg.min_num_unmasked,
+        )
+    elif cfg.type == 'mixed':
+        return LigandMixedMask(
+            min_ratio=cfg.min_ratio,
+            max_ratio=cfg.max_ratio,
+            min_num_masked=cfg.min_num_masked,
+            min_num_unmasked=cfg.min_num_unmasked,
+            p_random=cfg.p_random,
+            p_bfs=cfg.p_bfs,
+            p_invbfs=cfg.p_invbfs,
+        )
+    elif cfg.type == 'all':
+        return LigandMaskAll()
+    else:
+        raise NotImplementedError('Unknown mask: %s' % cfg.type)
+def kabsch(A, B):
+    # Input:
+    #     Nominal  A Nx3 matrix of points
+    #     Measured B Nx3 matrix of points
+    # Returns R,t
+    # R = 3x3 rotation matrix (B to A)
+    # t = 3x1 translation vector (B to A)
+    assert len(A) == len(B)
+    N = A.shape[0]  # total points
+    centroid_A = np.mean(A, axis=0)
+    centroid_B = np.mean(B, axis=0)
+    # center the points
+    AA = A - np.tile(centroid_A, (N, 1))
+    BB = B - np.tile(centroid_B, (N, 1))
+    H = np.transpose(BB) * AA
+    U, S, Vt = np.linalg.svd(H)
+    R = Vt.T * U.T
+    # special reflection case
+    if np.linalg.det(R) < 0:
+        Vt[2, :] *= -1
+        R = Vt.T * U.T
+    t = -R * centroid_B.T + centroid_A.T
+    return R, t

utils/warmup.py ADDED Viewed

	@@ -0,0 +1,86 @@

+"""
+MIT License
+Copyright (c) 2019 Ildoo Kim
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+class GradualWarmupScheduler(_LRScheduler):
+    """ Gradually warm-up(increasing) learning rate in optimizer.
+    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        multiplier: target learning rate = base lr * multiplier if multiplier > 1.0. if multiplier = 1.0, lr starts from 0 and ends up with the base_lr.
+        total_epoch: target learning rate is reached at total_epoch, gradually
+        after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)
+    """
+    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
+        self.multiplier = multiplier
+        if self.multiplier < 1.:
+            raise ValueError('multiplier should be greater thant or equal to 1.')
+        self.total_epoch = total_epoch
+        self.after_scheduler = after_scheduler
+        self.finished = False
+        super(GradualWarmupScheduler, self).__init__(optimizer)
+    def get_lr(self):
+        if self.last_epoch > self.total_epoch:
+            if self.after_scheduler:
+                if not self.finished:
+                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
+                    self.finished = True
+                return self.after_scheduler.get_last_lr()
+            return [base_lr * self.multiplier for base_lr in self.base_lrs]
+        if self.multiplier == 1.0:
+            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
+        else:
+            return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
+    def step_ReduceLROnPlateau(self, metrics, epoch=None):
+        if epoch is None:
+            epoch = self.last_epoch + 1
+        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
+        if self.last_epoch <= self.total_epoch:
+            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
+            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
+                param_group['lr'] = lr
+        else:
+            if epoch is None:
+                self.after_scheduler.step(metrics, None)
+            else:
+                self.after_scheduler.step(metrics, epoch - self.total_epoch)
+    def step(self, epoch=None, metrics=None):
+        if type(self.after_scheduler) != ReduceLROnPlateau:
+            if self.finished and self.after_scheduler:
+                if epoch is None:
+                    self.after_scheduler.step(None)
+                else:
+                    self.after_scheduler.step(epoch - self.total_epoch)
+                self._last_lr = self.after_scheduler.get_last_lr()
+            else:
+                return super(GradualWarmupScheduler, self).step(epoch)
+        else:
+            self.step_ReduceLROnPlateau(metrics, epoch)

vocab.txt ADDED Viewed

	@@ -0,0 +1,549 @@

+CC:108150
+CN:59667
+CO:39300
+C=O:36148
+C1=CC=CC=C1:17649
+OP:7954
+O=S:5180
+CF:4607
+CS:4246
+C[NH3+]:3561
+O=P:3006
+CCl:2484
+C[NH+]:2321
+C=N:2305
+C1CCNC1:2115
+[H]N:2073
+C1CCOCC1:1957
+C1=CC=NC=C1:1892
+C1=CN=CN=C1:1875
+NS:1824
+C1CCOC1:1578
+C[NH2+]:1291
+C1CCCCC1:1209
+C=C:1202
+C1=CNC=N1:1184
+C[N+]:1066
+C1=CNN=C1:676
+C1CCNCC1:670
+CP:662
+C1=CCCC=C1:628
+OS:624
+C1=CSC=C1:614
+C1CCCC1:531
+C#N:481
+NO:477
+C1=CSC=N1:463
+CBr:437
+C1=CNCNC1:436
+C1CC1:434
+C1=CCCCC1:423
+C1=CNC=C1:391
+C1CC[NH+]CC1:352
+C1=CCNC=C1:319
+[N+]=O:315
+C1=CN=CNC1:313
+C1=NCCN1:311
+[N+][O-]:310
+C1=CNCC1:300
+C1CC[NH2+]CC1:274
+C1CC[NH2+]C1:264
+BO:260
+C1C[NH+]CCN1:247
+C1=CNN=N1:226
+C1=COC=C1:218
+C1CC[NH+]C1:210
+C#C:208
+C1COCCN1:206
+C1=CN=CC=N1:184
+C1=CCCN=C1:179
+C1=CCNCC1:176
+CI:175
+C1CNCN1:171
+C1CNCCN1:165
+C1=COCCC1:165
+C1=CON=C1:162
+C1COCC[NH+]1:138
+BC:137
+C1=CCNC1:131
+C1CNCNC1:129
+C1=CNCN=C1:127
+NP:124
+C1=NN=CN1:124
+C1=NC=NN1:123
+C1C[NH+]CC[NH+]1:118
+C=S:117
+C1=CCCC1:112
+C1=NCCCN1:107
+C1=COCC1:107
+C1=NCCS1:102
+C1=NNCC1:100
+C1CSCN1:98
+NN:95
+C1=CNCCC1:94
+C1=CCNN=C1:90
+C1=COC=N1:81
+C1COCO1:78
+C1=COCO1:78
+C1=CCOCC1:75
+C1=NC=NCC1:73
+C1=CNCC=N1:72
+C1=NN=CS1:70
+C1CCSC1:66
+C1CCC1:65
+C1C[NH2+]CCN1:63
+C1=CSCC1:62
+C1=COC=CC1:62
+C1=NC=NC=N1:60
+C1COCN1:59
+N=N:57
+C1=CNCN1:55
+C1=CNCCN1:55
+C1=CNC=NC1:54
+C1CCCNCC1:51
+C1=CN=CCC1:51
+C1=CC1:51
+C1C2CC3CC1CC(C2)C3:50
+C1=CCC=CC1:50
+C:N:49
+C1=CCOC=C1:48
+[NH2+]O:47
+C1=NCNCC1:46
+C1CNC1:44
+C1CNCCNC1:42
+C1=CNC=CC1:40
+C1CO1:39
+C1=CC[NH+]CC1:39
+N=O:37
+C1=CSCN1:35
+[NH+]N:34
+C1=NC=NO1:34
+C1=CNCCNC1:32
+C1=CC[NH2+]CC1:30
+[N+]=[N-]:29
+[N+]=N:29
+O[PH]:29
+C1CNCSC1:29
+C1CCCCCC1:29
+C1=NN=NN1:27
+C1COPOC1:26
+C1CCSCC1:26
+C1=NNCCC1:26
+C1=CCCCCC1:26
+C1C[Fe]1:25
+C1=NN=CO1:25
+C1=NCCN=C1:25
+C1=CSC=[N+]1:25
+PS:24
+C1CNCOC1:24
+C1CCC[NH2+]CC1:24
+C1=NCCO1:24
+C1=COCCN1:24
+C1=CNC=[N+]1:24
+C1=CCN=CC1:24
+C[PH]:23
+C1COCOC1:23
+C1CNCC[NH2+]C1:23
+C1=CONC1:23
+C1CNCCOC1:22
+C1=NON=C1:21
+C1=NNCN1:21
+O:S:20
+C[Si]:20
+C1C[NH2+]CC[NH+]1:19
+C1C[NH+]C1:19
+C1=[N+]CCC1:19
+C1=NOCC1:19
+C1NO1:18
+C1=CNCCN=C1:18
+SS:17
+C1CSC[NH+]1:17
+C1COCCO1:17
+C1=CCOC1:17
+C[Se]:16
+C[AsH]:16
+C1CSC[NH2+]1:16
+C1CSCCN1:16
+C1CCCCCCC1:16
+C1=NC=NCN1:16
+C1=CSN=C1:16
+C1=CSCCC1:16
+C1=COCCO1:16
+C1=CCSC1:16
+C1=CCN=C1:16
+C1COC1:15
+C1=NCCNC1:15
+C1=NC=NNC1:15
+C1=C[Fe]1:15
+C1=CSNCC1:15
+C1=CN[C@H]2CCCC(N1)O2:15
+C1=CC=[N+]C=C1:15
+[NH+]O:14
+C=[N+]:14
+C1CCNCNC1:14
+C1=NNN=C1:14
+C1=CNNC1:14
+C1=CN=NC=C1:14
+C1=CC=NN=C1:14
+C1C[C@@H]2CC[C@H](C1)[NH+]2:13
+C1CNN=N1:13
+C1=NCCCC1:13
+C1C[Ru]1:12
+C1=NNN=N1:12
+[N+]N:11
+C1CNSC1:11
+C1CC[NH2+]NC1:11
+C1CC2NCCN[C@@H](C1)O2:11
+C1=NCNC1:11
+C1=NC=NC1:11
+C1=CCCCC=C1:11
+C1=CCC=C1:11
+O=[SH]:10
+C1CSNCN1:10
+C1=CSCCN1:10
+C1=CC=CCC=C1:10
+C1C[NH2+]C1:9
+C1C[NH+]2CCC1CC2:9
+C1CC[SH]C1:9
+C1=[N+]CCN1:9
+C1=CSNC1:9
+C1CN[Fe]NC1:8
+C1CNCC[NH+]C1:8
+C1=NNCCN1:8
+C1=CSN=N1:8
+C1=CNCCCN1:8
+C1=CNC=CN1:8
+C1=CC[NH+]C1:8
+C1=CCSCC1:8
+C1=CCNNC1:8
+C1NCON1:7
+C1COCC[NH2+]1:7
+C1CNSN1:7
+C1CNNC1:7
+C1CCC[NH+]CC1:7
+C1=[N+]O[Fe]O1:7
+C1=C[Ru]1:7
+C1=C[N+][Co][N+]=C1:7
+C1=CNN=CC1:7
+[O-]P:6
+O:P:6
+N[NH3+]:6
+N[NH2+]:6
+FS:6
+C[SeH]:6
+C1NO[Fe]O1:6
+C1C[C@@H]2CC[C@H](C1)N2:6
+C1CSCCO1:6
+C1CC[N+]C1:6
+C1=NSN=C1:6
+C1=NCCC1:6
+C1=COCN1:6
+C1=CNNCC1:6
+C1=CNCSC1:6
+C1=CNC=CNC1:6
+C1=CN=NC1:6
+C1=CC[N+]=C1:6
+[N+]O:5
+P=S:5
+C1[NH+]O1:5
+C1C[Rh]1:5
+C1C[C@@H]2CC[C@H](C1)[N+]2:5
+C1C[C@@H]2CCNC[C@H](C1)N2:5
+C1CN[NH2+]C1:5
+C1CC[SH]CC1:5
+C1CCNSCC1:5
+C1CCNNC1:5
+C1=N[N+]=CS1:5
+C1=NSCCN1:5
+C1=NCCNCC1:5
+C1=NC=NS1:5
+C1=C\CCCCCC/1:5
+C1=C[Fe]C1:5
+C1=CSNCN1:5
+C1=COCCCN1:5
+C1=CN[Co][N+]=C1:5
+C1=CNCCCC1:5
+C1=CN=NC=N1:5
+C1=CCCNCC1:5
+[NH3+]O:4
+[NH+][NH2+]:4
+O[Si]:4
+C1N[NH2+]CS1:4
+C1NCN[Fe]N1:4
+C1NCNN1:4
+C1C[NH2+]1:4
+C1C[NH+]CC[NH+]C1:4
+C1C[C@H]2CC[C@@H]1C2:4
+C1C[C@@H]2CC[C@H](C1)[NH2+]2:4
+C1COCC[N+]1:4
+C1CNOC1:4
+C1CCO[NH2+]C1:4
+C1CC2CCC1C2:4
+C1=[N+]NCC1:4
+C1=NNCNC1:4
+C1=NCCSN1:4
+C1=C[N+][Mg][N+]=C1:4
+C1=CSNC=N1:4
+C1=CN=NCC1:4
+C1=CCNCCC1:4
+C1=CCCOCC1:4
+C1=CCCOC=C1:4
+C1=CCC1:4
+C1=C2C[N@@H+]3CC[C@]45CCN6CC[C@H](OC1)[C@@H]([C@H]64)[C@H]2C[C@H]35:4
+B1CCC=CO1:4
+O=[V]:3
+O=[Sb]:3
+N=S:3
+C1NC[C@H]2CNC[C@@H]1C2:3
+C1C[C@@H]2CC[C@H](C1)C2:3
+C1COSO1:3
+C1COC[NH2+]1:3
+C1CN[NH+]C1:3
+C1CN[Co][N+]1:3
+C1CNCC[N+]1:3
+C1CNCCSC1:3
+C1CN=[N+]C1:3
+C1CC[NH+]NC1:3
+C1CC[NH+]CCNC1:3
+C1CCSNC1:3
+C1CCNSNC1:3
+C1CC2C[NH+][C@@H]3C4CCC[C@]3(C1)[C@@H]2C4:3
+C1CC2CCC1CC2:3
+C1=[N+]CCS1:3
+C1=[N+]CCCN1:3
+C1=[N+]CCCCC1:3
+C1=NSCC1:3
+C1=NNCSC1:3
+C1=NNCS1:3
+C1=NCNN=C1:3
+C1=NCCCCC1:3
+C1=C[Ru]C1:3
+C1=C[Rh]1:3
+C1=C[N+]=CC1:3
+C1=C[C@H]2COCC(C1)C2:3
+C1=C[C@H]2CC[C@@H]1N2:3
+C1=C[C@H]2CC[C@@H]1C2:3
+C1=C[C@@H]2CC=C[C@H](C1)C2:3
+C1=COCOC1:3
+C1=COCCNC1:3
+C1=CNSCCN1:3
+C1=CNSC=C1:3
+C1=CNSC1:3
+C1=CNC=CCC1:3
+C1=CCCCN=C1:3
+C1=CCCCCCC1:3
+C1=CCCC=CC1:3
+C1=CC2CCO[C@@H](C1)C2:3
+B1C=CCO1:3
+[O-]S:2
+OO:2
+C1OCO1:2
+C1C[NH2+]C[NH2+]C1:2
+C1C[NH2+]CN1:2
+C1C[NH+][NH2+]C1:2
+C1C[C@H]2C[C@H](CCO2)N1:2
+C1C[C@H]2COC[C@@H]1C2:2
+C1C[C@@H]2C[C@H]1CN2:2
+C1C[C@@H]2CNC[C@H](C1)N2:2
+C1CSN=N1:2
+C1CNC[NH2+]C1:2
+C1CN1:2
+C1CC[NH2+][NH2+]C1:2
+C1CC[N+]CC1:2
+C1CCSCNC1:2
+C1CCCNCCC1:2
+C1CC2OC[C@@H](C1)O2:2
+C1CC2COC(C1)C2:2
+C1CC2CC[C@H](C1)[NH2+]2:2
+C1CC2CCO[Fe](O1)OC2:2
+C1CC2CCC(O1)O2:2
+C1CC2CCC(C1)C2:2
+C1=[N+]CNN1:2
+C1=N[N+]=CCC1:2
+C1=NSNC1:2
+C1=NNCO1:2
+C1=NN=CCC1:2
+C1=NN=CC1:2
+C1=NCSCC1:2
+C1=NCC=[N+]1:2
+C1=NC=NN=C1:2
+C1=C[N+][Zn]NC1:2
+C1=C[N+]C=CC1:2
+C1=C[C@H]2C[NH2+]C[C@@H]1C2:2
+C1=C[C@H]2C[NH2+]C[C@@H](C1)[NH2+]2:2
+C1=C[C@H]2CC=CC(C1)C2:2
+C1=CSNCCO1:2
+C1=CSC=CN1:2
+C1=COSNC1:2
+C1=COCCCNC1:2
+C1=COCC=N1:2
+C1=CN[Zn][N+]=C1:2
+C1=CN[C@H]2CCC(N1)O2:2
+C1=CNSNC1:2
+C1=CNSCCC1:2
+C1=CNNC=C1:2
+C1=CNC[NH2+]C1:2
+C1=CNCCOC1:2
+C1=CNCCCNC1:2
+C1=CN=[N+]C1:2
+C1=CN=CNCC1:2
+C1=CN=CCN=C1:2
+C1=CN=CC1:2
+C1=CC[NH2+]NC1:2
+C1=CC[NH2+]C1:2
+C1=CC[NH+]NC1:2
+C1=CC[NH+]CCC1:2
+C1=CC[N+]CC1:2
+C1=CCOC=CC1:2
+C1=CCN=CCC1:2
+C1=CCCNC=C1:2
+C1=CC2CC(C1)CO2:2
+B1OCCO1:2
+B1CCCO1:2
+[N+][NH+]:1
+O[Fe]:1
+O1POPOPOP1:1
+N1OO1:1
+C[TeH]:1
+C[Sb]:1
+C[Ru]:1
+C[O-]:1
+C[AsH2]:1
+C1O[C@@H]2COC1O2:1
+C1N[C@@H]2CN[C@H]1C2:1
+C1NN=NN1:1
+C1NC[C@H]2C[NH2+]C[C@@H]1C2:1
+C1NC[C@H]2C[NH+]C[C@@H]1C2:1
+C1NC[C@H]2CC[C@H](C2)N1:1
+C1NC[C@@H]2OC[C@H]1O2:1
+C1NCO[NH2+]1:1
+C1NCNCN1:1
+C1NC2COC[C@H](C2)S1:1
+C1N=[N+]CS1:1
+C1N=NCO1:1
+C1N=N1:1
+C1C[NH2+][Pt][NH2+]1:1
+C1C[NH2+]OC1:1
+C1C[NH2+]C[NH+]1:1
+C1C[NH2+]CSC1:1
+C1C[NH2+]CC[NH2+]1:1
+C1C[NH2+]CCSC1:1
+C1C[NH2+]CCOC1:1
+C1C[NH+][NH2+]N1:1
+C1C[NH+]C[NH+]C1:1
+C1C[NH+]CSC1:1
+C1C[N+][Co][N+]1:1
+C1C[N+]CNC1:1
+C1C[C@H]2C[NH+]C[C@@H]1N2:1
+C1C[C@H]2C[C@@H]1CN2:1
+C1C[C@H]2CC[C@@H]1O2:1
+C1C[C@H]2CCC[C@@H](C1)[NH+]2:1
+C1C[C@@H]2C[C@@H](CCO2)N1:1
+C1C[C@@H]2CO[C@H](C1)N2:1
+C1C[C@@H]2CC[C@H](C1)O2:1
+C1CSCS1:1
+C1CSCN[NH2+]1:1
+C1CSCC[NH+]1:1
+C1CSC1:1
+C1CO[V]O1:1
+C1CO[SH]C1:1
+C1COSN1:1
+C1COPO1:1
+C1COCCOC1:1
+C1CN[C@H]2CCC(N1)O2:1
+C1CNSCCN1:1
+C1CNNCN1:1
+C1CNCN[NH+]C1:1
+C1CN=NC1:1
+C1CCSSC1:1
+C1CCONC1:1
+C1CCOCOC1:1
+C1CCOCCOC1:1
+C1CCNNCC1:1
+C1CC2CO[C@@H](C2)[NH+]1:1
+C1CC2COC(C1)OO2:1
+C1CC2CC[NH+][C@H](C1)C2:1
+C1CC2CC[C@H](C1)C2:1
+C1CC2CCC1C[NH+]2:1
+C1CC2CC1CO2:1
+C1C2C[C@@H]3C[C@H](C2)C1O3:1
+C1C2CC1C2:1
+C1=[N+]CON1:1
+C1=[N+]CCOC1:1
+C1=[N+]CCNC1:1
+C1=NSN=CC1:1
+C1=NO[N+]=C1:1
+C1=NOCN1:1
+C1=NNNC1:1
+C1=NNC=[N+]1:1
+C1=NN=[N+]C1:1
+C1=NC[NH2+]CC1:1
+C1=NC[NH2+]C1:1
+C1=NCON1:1
+C1=NCOC1:1
+C1=NCNN1:1
+C1=NCNCN1:1
+C1=NCN=CN1:1
+C1=NCC[N+]=C1:1
+C1=NCCC=[N+]1:1
+C1=C\CCOCCC/1:1
+C1=C[Se]C=N1:1
+C1=C[SH]CCCN1:1
+C1=C[Ru]NC1:1
+C1=C[Rh]C1:1
+C1=C[N+]CC1:1
+C1=C[N+]C=NC1:1
+C1=C[N+]=CNC1:1
+C1=C[C@H]2C[C@H](CCO2)N1:1
+C1=C[C@H]2C[C@H](C2)NC1:1
+C1=C[C@H]2CC[C@@H]1O2:1
+C1=C[C@H]2CCC[C@@H]1[N+]2:1
+C1=C[C@H]2CCCC1[NH2+]2:1
+C1=C[C@@H]2C[NH2+]C[C@H](C1)[NH2+]2:1
+C1=C[C@@H]2CN(C1)CN2:1
+C1=C[C@@H]2CCOC(C1)O2:1
+C1=C[C@@H]2CC=CC(C1)C2:1
+C1=CSOCNC1:1
+C1=CSOC1:1
+C1=CSCO1:1
+C1=CSCCCN1:1
+C1=CSCCC=N1:1
+C1=CSC=CC1:1
+C1=CPNCN1:1
+C1=CO[NH2+]C1:1
+C1=COCNC1:1
+C1=COCC[N+]1:1
+C1=COCCCO1:1
+C1=COCCCC1:1
+C1=COC=CN1:1
+C1=CN[NH2+]N1:1
+C1=CN[NH+]C1:1
+C1=CN[C@@H]2CC[C@H](N1)O2:1
+C1=CNOC1:1
+C1=CNCOC1:1
+C1=CNCNN=C1:1
+C1=CNCN=N1:1
+C1=CNCCC=N1:1
+C1=CNCC=[N+]1:1
+C1=CNC=[N+]C1:1
+C1=CN=CCSC1:1
+C1=CN=CCCC1:1
+C1=CC[NH2+]CCC1:1
+C1=CC[N+]C=C1:1
+C1=CCOCCC1:1
+C1=CCNCC=C1:1
+C1=CCN=NC1:1
+C1=CCC[NH+]CC1:1
+C1=CCCSC=C1:1
+C1=CC=NCC=C1:1
+C1=CC=COC=C1:1
+C1=CC=CNC=C1:1
+C1=CC2CC[NH+][C@@H](C1)C2:1
+C1=CC2CCCC(C1)C2:1
+C1=CC2CCC(C1)O2:1
+C1=CC2CC=C[C@H](C2)OC1:1
+C1=CC2CC(CN2)[NH2+]C1:1
+C1=CC2C3C[NH2+][C@H]2CC1C3:1
+C1#CCCCCCC1:1
+B1OCCCO1:1
+B1CCCCO1:1