Spaces:

saicharan2804
/

molgen_metric

Runtime error

App Files Files Community

saicharan2804 committed on Mar 6

Commit

36173e1

•

1 Parent(s): af1e58a

Added manual implementation of metrics

Browse files

Files changed (5) hide show

fcd.py +102 -0
metrics.py +344 -0
molgen_metric.py +0 -2
utils.py +316 -0
utils2.py +271 -0

fcd.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from fcd_torch.utils import SmilesDataset, \
+                            calculate_frechet_distance, \
+                            todevice, \
+                            load_imported_model
+import torch
+from torch.utils.data import DataLoader
+import os
+import numpy as np
+import warnings
+class FCD:
+    """
+    Computes Frechet ChemNet Distance on PyTorch.
+    * You can precalculate mean and sigma for further usage,
+      e.g. if you use the statistics from the same dataset
+      multiple times.
+    * Supports GPU and selection of GPU index
+    * Multithread SMILES parsing
+    Example 1:
+        fcd = FCD(device='cuda:0', n_jobs=8)
+        smiles_list = ['CCC', 'CCNC']
+        fcd(smiles_list, smiles_list)
+    Example 2:
+        fcd = FCD(device='cuda:0', n_jobs=8)
+        smiles_list = ['CCC', 'CCNC']
+        pgen = fcd.precalc(smiles_list)
+        fcd(smiles_list, pgen=pgen)
+    """
+    def __init__(self, device='cpu', n_jobs=1,
+                 batch_size=512,
+                 model_path=None,
+                 canonize=True):
+        """
+        Loads ChemNet on device
+        params:
+            device: cpu for CPU, cuda:0 for GPU 0, etc.
+            n_jobs: number of workers to parse SMILES
+            batch_size: batch size for processing SMILES
+            model_path: path to ChemNet_v0.13_pretrained.pt. When None,
+                the file is looked up next to this module
+            canonize: forwarded unchanged to SmilesDataset
+        """
+        if model_path is None:
+            model_dir = os.path.split(__file__)[0]
+            model_path = os.path.join(model_dir, 'ChemNet_v0.13_pretrained.pt')
+        self.device = device
+        # DataLoader expects num_workers=0 for "load in the main process",
+        # so a single job is mapped to 0 workers.
+        self.n_jobs = n_jobs if n_jobs != 1 else 0
+        self.batch_size = batch_size
+        keras_config = torch.load(model_path)
+        self.model = load_imported_model(keras_config)
+        self.model.eval()
+        self.canonize = canonize
+    def get_predictions(self, smiles_list):
+        """
+        Runs the model over smiles_list in batches and returns the
+        outputs stacked row-wise as one numpy array. An empty input
+        yields an array of shape (0, 512).
+        """
+        if len(smiles_list) == 0:
+            return np.zeros((0, 512))
+        dataloader = DataLoader(
+            SmilesDataset(smiles_list, canonize=self.canonize),
+            batch_size=self.batch_size,
+            num_workers=self.n_jobs
+        )
+        with todevice(self.model, self.device), torch.no_grad():
+            chemnet_activations = []
+            for batch in dataloader:
+                chemnet_activations.append(
+                    self.model(
+                        batch.transpose(1, 2).float().to(self.device)
+                    ).to('cpu').detach().numpy()
+                )
+        # np.row_stack is a deprecated alias of np.vstack.
+        return np.vstack(chemnet_activations)
+    def precalc(self, smiles_list):
+        """
+        Returns {'mu': ..., 'sigma': ...} computed from the activations
+        of smiles_list, or an empty dict (with a warning) when fewer
+        than two molecules are given.
+        """
+        if len(smiles_list) < 2:
+            warnings.warn("Can't compute FCD for less than 2 molecules "
+                          "({} given)".format(len(smiles_list)))
+            return {}
+        chemnet_activations = self.get_predictions(smiles_list)
+        mu = chemnet_activations.mean(0)
+        sigma = np.cov(chemnet_activations.T)
+        return {'mu': mu, 'sigma': sigma}
+    def metric(self, pref, pgen):
+        """
+        Returns the Frechet distance between two precalculated
+        statistics dicts, or NaN (with a warning) when either dict
+        lacks 'mu' or 'sigma'.
+        """
+        # Both keys are looked up in pref here; checking 'sigma' in pgen
+        # instead would let an incomplete reference through to a KeyError.
+        if 'mu' not in pref or 'sigma' not in pref:
+            warnings.warn("Failed to compute FCD (check ref)")
+            return np.nan
+        if 'mu' not in pgen or 'sigma' not in pgen:
+            warnings.warn("Failed to compute FCD (check gen)")
+            return np.nan
+        return calculate_frechet_distance(
+            pref['mu'], pref['sigma'], pgen['mu'], pgen['sigma']
+        )
+    def __call__(self, ref=None, gen=None, pref=None, pgen=None):
+        """
+        Computes the metric from raw SMILES (ref / gen) or from
+        precalculated statistics (pref / pgen). Exactly one of each
+        pair must be given.
+        """
+        assert (ref is None) != (pref is None), "specify ref xor pref"
+        assert (gen is None) != (pgen is None), "specify gen xor pgen"
+        if pref is None:
+            pref = self.precalc(ref)
+        if pgen is None:
+            pgen = self.precalc(gen)
+        return self.metric(pref, pgen)

metrics.py ADDED Viewed

	@@ -0,0 +1,344 @@

+import warnings
+from multiprocessing import Pool
+import numpy as np
+from scipy.spatial.distance import cosine as cos_distance
+from fcd_torch import FCD as FCDMetric
+from scipy.stats import wasserstein_distance
+from moses.dataset import get_dataset, get_statistics
+from moses.utils import mapper
+from moses.utils import disable_rdkit_log, enable_rdkit_log
+from .utils import compute_fragments, average_agg_tanimoto, \
+    compute_scaffolds, fingerprints, \
+    get_mol, canonic_smiles, mol_passes_filters, \
+    logP, QED, SA, weight
+def get_all_metrics(gen, k=None, n_jobs=1,
+                    device='cpu', batch_size=512, pool=None,
+                    test=None, test_scaffolds=None,
+                    ptest=None, ptest_scaffolds=None,
+                    train=None):
+    """
+    Computes all available metrics between test (scaffold test)
+    and generated sets of SMILES.
+    Parameters:
+        gen: list of generated SMILES
+        k: int or list with values for unique@k. Will calculate number of
+            unique molecules in the first k molecules. Default [1000, 10000]
+        n_jobs: number of workers for parallel processing
+        device: 'cpu' or 'cuda:n', where n is GPU device number
+        batch_size: batch size for FCD metric
+        pool: optional multiprocessing pool to use for parallelization
+        test (None or list): test SMILES. If None, will load
+            a default test set
+        test_scaffolds (None or list): scaffold test SMILES. If None, will
+            load a default scaffold test set
+        ptest (None or dict): precalculated statistics of the test set. If
+            None, will load default test statistics. If you specified a custom
+            test set, default test statistics will be ignored
+        ptest_scaffolds (None or dict): precalculated statistics of the
+            scaffold test set If None, will load default scaffold test
+            statistics. If you specified a custom test set, default test
+            statistics will be ignored
+        train (None or list): train SMILES. If None, will load a default
+            train set
+    Returns:
+        dict mapping metric name to value
+    Raises:
+        ValueError: if precalculated statistics are passed without the
+            matching custom SMILES set
+    Available metrics:
+        * %valid
+        * %unique@k
+        * Frechet ChemNet Distance (FCD)
+        * Fragment similarity (Frag)
+        * Scaffold similarity (Scaf)
+        * Similarity to nearest neighbour (SNN)
+        * Internal diversity (IntDiv)
+        * Internal diversity 2: using square root of mean squared
+            Tanimoto similarity (IntDiv2)
+        * %passes filters (Filters)
+        * Distribution difference for logP, SA, QED, weight
+        * Novelty (molecules not present in train)
+    """
+    if test is None:
+        if ptest is not None:
+            raise ValueError(
+                "You cannot specify custom test "
+                "statistics for default test set")
+        test = get_dataset('test')
+        ptest = get_statistics('test')
+    if test_scaffolds is None:
+        if ptest_scaffolds is not None:
+            raise ValueError(
+                "You cannot specify custom scaffold test "
+                "statistics for default scaffold test set")
+        test_scaffolds = get_dataset('test_scaffolds')
+        ptest_scaffolds = get_statistics('test_scaffolds')
+    # Explicit None check: `train or ...` would silently replace an empty
+    # list with the default set and raises on array-like inputs.
+    if train is None:
+        train = get_dataset('train')
+    if k is None:
+        k = [1000, 10000]
+    if not isinstance(k, (list, tuple)):
+        k = [k]
+    disable_rdkit_log()
+    metrics = {}
+    close_pool = False
+    # try/finally so RDKit logging is re-enabled and a pool created here
+    # is shut down even when one of the metrics raises.
+    try:
+        if pool is None:
+            if n_jobs != 1:
+                pool = Pool(n_jobs)
+                close_pool = True
+            else:
+                pool = 1
+        metrics['valid'] = fraction_valid(gen, n_jobs=pool)
+        gen = remove_invalid(gen, canonize=True, n_jobs=pool)
+        for _k in k:
+            metrics['unique@{}'.format(_k)] = fraction_unique(gen, _k, pool)
+        if ptest is None:
+            ptest = compute_intermediate_statistics(test, n_jobs=n_jobs,
+                                                    device=device,
+                                                    batch_size=batch_size,
+                                                    pool=pool)
+        if test_scaffolds is not None and ptest_scaffolds is None:
+            ptest_scaffolds = compute_intermediate_statistics(
+                test_scaffolds, n_jobs=n_jobs,
+                device=device, batch_size=batch_size,
+                pool=pool
+            )
+        mols = mapper(pool)(get_mol, gen)
+        # FCD takes the integer worker count; the other metrics take the
+        # pool (or 1) so they can reuse it through mapper().
+        kwargs = {'n_jobs': pool, 'device': device, 'batch_size': batch_size}
+        kwargs_fcd = {'n_jobs': n_jobs, 'device': device,
+                      'batch_size': batch_size}
+        metrics['FCD/Test'] = FCDMetric(**kwargs_fcd)(
+            gen=gen, pref=ptest['FCD'])
+        metrics['SNN/Test'] = SNNMetric(**kwargs)(
+            gen=mols, pref=ptest['SNN'])
+        metrics['Frag/Test'] = FragMetric(**kwargs)(
+            gen=mols, pref=ptest['Frag'])
+        metrics['Scaf/Test'] = ScafMetric(**kwargs)(
+            gen=mols, pref=ptest['Scaf'])
+        if ptest_scaffolds is not None:
+            metrics['FCD/TestSF'] = FCDMetric(**kwargs_fcd)(
+                gen=gen, pref=ptest_scaffolds['FCD']
+            )
+            metrics['SNN/TestSF'] = SNNMetric(**kwargs)(
+                gen=mols, pref=ptest_scaffolds['SNN']
+            )
+            metrics['Frag/TestSF'] = FragMetric(**kwargs)(
+                gen=mols, pref=ptest_scaffolds['Frag']
+            )
+            metrics['Scaf/TestSF'] = ScafMetric(**kwargs)(
+                gen=mols, pref=ptest_scaffolds['Scaf']
+            )
+        metrics['IntDiv'] = internal_diversity(mols, pool, device=device)
+        metrics['IntDiv2'] = internal_diversity(mols, pool, device=device,
+                                                p=2)
+        metrics['Filters'] = fraction_passes_filters(mols, pool)
+        # Properties
+        for name, func in [('logP', logP), ('SA', SA),
+                           ('QED', QED),
+                           ('weight', weight)]:
+            metrics[name] = WassersteinMetric(func, **kwargs)(
+                gen=mols, pref=ptest[name])
+        # train is never None at this point (a default set is loaded above).
+        metrics['Novelty'] = novelty(mols, train, pool)
+    finally:
+        enable_rdkit_log()
+        if close_pool:
+            pool.close()
+            pool.join()
+    return metrics
+def compute_intermediate_statistics(smiles, n_jobs=1, device='cpu',
+                                    batch_size=512, pool=None):
+    """
+    The function precomputes statistics such as mean and variance for FCD, etc.
+    It is useful to compute the statistics for test and scaffold test sets to
+        speedup metrics calculation.
+    Parameters:
+        smiles: list of SMILES to compute the statistics for
+        n_jobs: number of workers; a pool is created here when it is not 1
+            and no pool is passed
+        device: device passed on to the individual metrics
+        batch_size: batch size passed on to the individual metrics
+        pool: optional existing pool; it is left open for the caller
+    Returns:
+        dict with keys 'FCD', 'SNN', 'Frag', 'Scaf', 'logP', 'SA', 'QED'
+        and 'weight', each holding that metric's precalc() result
+    """
+    close_pool = False
+    if pool is None:
+        if n_jobs != 1:
+            pool = Pool(n_jobs)
+            close_pool = True
+        else:
+            pool = 1
+    statistics = {}
+    # try/finally so a pool created here is torn down even if a metric
+    # raises part-way through.
+    try:
+        mols = mapper(pool)(get_mol, smiles)
+        kwargs = {'n_jobs': pool, 'device': device, 'batch_size': batch_size}
+        kwargs_fcd = {'n_jobs': n_jobs, 'device': device,
+                      'batch_size': batch_size}
+        statistics['FCD'] = FCDMetric(**kwargs_fcd).precalc(smiles)
+        statistics['SNN'] = SNNMetric(**kwargs).precalc(mols)
+        statistics['Frag'] = FragMetric(**kwargs).precalc(mols)
+        statistics['Scaf'] = ScafMetric(**kwargs).precalc(mols)
+        for name, func in [('logP', logP), ('SA', SA),
+                           ('QED', QED),
+                           ('weight', weight)]:
+            statistics[name] = WassersteinMetric(func, **kwargs).precalc(mols)
+    finally:
+        if close_pool:
+            pool.terminate()
+    return statistics
+def fraction_passes_filters(gen, n_jobs=1):
+    """
+    Returns the mean of mol_passes_filters over gen, i.e. the share of
+    molecules accepted by the filters:
+    * MCF
+    * PAINS
+    * Only allowed atoms ('C','N','S','O','F','Cl','Br','H')
+    * No charges
+    Parameters:
+        gen: molecules (or SMILES) to check
+        n_jobs: worker count or pool, handed to mapper
+    """
+    apply_filters = mapper(n_jobs)
+    verdicts = apply_filters(mol_passes_filters, gen)
+    return np.mean(verdicts)
+def internal_diversity(gen, n_jobs=1, device='cpu', fp_type='morgan',
+                       gen_fps=None, p=1):
+    """
+    Internal diversity of a set, defined as
+    1/|A|^2 sum_{x, y in AxA} (1-tanimoto(x, y))
+    Parameters:
+        gen: molecules to fingerprint (ignored when gen_fps is given)
+        n_jobs: worker count or pool for fingerprinting
+        device: device used for the similarity computation
+        fp_type: fingerprint type passed to fingerprints
+        gen_fps: optional precomputed fingerprints
+        p: power passed to average_agg_tanimoto
+    """
+    fps = gen_fps
+    if fps is None:
+        fps = fingerprints(gen, fp_type=fp_type, n_jobs=n_jobs)
+    similarity = average_agg_tanimoto(fps, fps,
+                                      agg='mean', device=device, p=p)
+    return 1 - similarity.mean()
+def fraction_unique(gen, k=None, n_jobs=1, check_validity=True):
+    """
+    Computes the fraction of unique molecules
+    Parameters:
+        gen: list of SMILES
+        k: compute unique@k (only the first k entries are used). A
+            warning is issued when gen has fewer than k entries
+        n_jobs: number of threads for calculation
+        check_validity: raises ValueError if invalid molecules are present
+    Returns:
+        number of distinct canonical SMILES divided by the number of
+        molecules considered
+    Raises:
+        ValueError: invalid molecule found while check_validity is set
+        ZeroDivisionError: gen is empty
+    """
+    if k is not None:
+        if len(gen) < k:
+            # The two sentences used to be concatenated without a
+            # separator ("unique@1000.gen contains ...").
+            warnings.warn(
+                "Can't compute unique@{}. ".format(k) +
+                "gen contains only {} molecules".format(len(gen))
+            )
+        gen = gen[:k]
+    canonic = set(mapper(n_jobs)(canonic_smiles, gen))
+    if None in canonic and check_validity:
+        raise ValueError("Invalid molecule passed to unique@k")
+    return len(canonic) / len(gen)
+def fraction_valid(gen, n_jobs=1):
+    """
+    Share of entries in gen that get_mol can turn into a molecule
+    Parameters:
+        gen: list of SMILES
+        n_jobs: number of threads for calculation
+    """
+    parsed = mapper(n_jobs)(get_mol, gen)
+    n_failed = parsed.count(None)
+    return 1 - n_failed / len(parsed)
+def novelty(gen, train, n_jobs=1):
+    """
+    Share of distinct canonical generated SMILES that do not occur in train
+    Parameters:
+        gen: generated molecules (or SMILES)
+        train: iterable of training SMILES
+        n_jobs: worker count or pool, handed to mapper
+    """
+    canonical = set(mapper(n_jobs)(canonic_smiles, gen))
+    canonical.discard(None)
+    known = set(train)
+    unseen = canonical - known
+    return len(unseen) / len(canonical)
+def remove_invalid(gen, canonize=True, n_jobs=1):
+    """
+    Drops entries that cannot be parsed. With canonize set, the
+    survivors are returned as canonical SMILES; otherwise the original
+    strings are kept.
+    """
+    run = mapper(n_jobs)
+    if canonize:
+        canonical = run(canonic_smiles, gen)
+        return [smi for smi in canonical if smi is not None]
+    parsed = run(get_mol, gen)
+    kept = []
+    for original, mol in zip(gen, parsed):
+        if mol is not None:
+            kept.append(original)
+    return kept
+class Metric:
+    """
+    Base class for metrics that compare a reference set with a
+    generated set through precalculated statistics. Subclasses
+    implement precalc() and metric().
+    """
+    def __init__(self, n_jobs=1, device='cpu', batch_size=512, **kwargs):
+        """
+        Parameters:
+            n_jobs: worker count or pool, stored for subclasses
+            device: device name, stored for subclasses
+            batch_size: batch size, stored for subclasses
+            **kwargs: any extra options; each is stored as an attribute
+        """
+        self.n_jobs = n_jobs
+        self.device = device
+        self.batch_size = batch_size
+        # items(), not values(): every extra keyword becomes an attribute
+        # named after its key.
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+    def __call__(self, ref=None, gen=None, pref=None, pgen=None):
+        """
+        Computes the metric from raw inputs (ref / gen) or from
+        precalculated statistics (pref / pgen). Exactly one of each
+        pair must be given.
+        """
+        assert (ref is None) != (pref is None), "specify ref xor pref"
+        assert (gen is None) != (pgen is None), "specify gen xor pgen"
+        if pref is None:
+            pref = self.precalc(ref)
+        if pgen is None:
+            pgen = self.precalc(gen)
+        return self.metric(pref, pgen)
+    def precalc(self, moleclues):
+        """Returns the statistics for one set; must be overridden."""
+        raise NotImplementedError
+    def metric(self, pref, pgen):
+        """Compares two precalc() results; must be overridden."""
+        raise NotImplementedError
+class SNNMetric(Metric):
+    """
+    Similarity to nearest neighbour: fingerprints both sets and
+    averages the aggregated Tanimoto similarity between them
+    """
+    def __init__(self, fp_type='morgan', **kwargs):
+        self.fp_type = fp_type
+        super().__init__(**kwargs)
+    def precalc(self, mols):
+        # The statistics are just the fingerprints of the set.
+        fps = fingerprints(mols, n_jobs=self.n_jobs, fp_type=self.fp_type)
+        return {'fps': fps}
+    def metric(self, pref, pgen):
+        ref_fps = pref['fps']
+        gen_fps = pgen['fps']
+        return average_agg_tanimoto(ref_fps, gen_fps, device=self.device)
+def cos_similarity(ref_counts, gen_counts):
+    """
+    Cosine similarity of two {name: count} dictionaries, treating a
+     name missing from one side as a zero count:
+     sim = <r, g> / ||r|| / ||g||
+    Returns NaN when either dictionary is empty.
+    """
+    if min(len(ref_counts), len(gen_counts)) == 0:
+        return np.nan
+    # Shared, sorted key axis so both vectors line up.
+    all_names = np.unique([*ref_counts.keys(), *gen_counts.keys()])
+    def _as_vector(counts):
+        return np.array([counts.get(name, 0) for name in all_names])
+    distance = cos_distance(_as_vector(ref_counts), _as_vector(gen_counts))
+    return 1 - distance
+class FragMetric(Metric):
+    """Cosine similarity between the fragment counts of two sets."""
+    def precalc(self, mols):
+        fragment_counts = compute_fragments(mols, n_jobs=self.n_jobs)
+        return {'frag': fragment_counts}
+    def metric(self, pref, pgen):
+        ref_counts, gen_counts = pref['frag'], pgen['frag']
+        return cos_similarity(ref_counts, gen_counts)
+class ScafMetric(Metric):
+    """Cosine similarity between the scaffold counts of two sets."""
+    def precalc(self, mols):
+        scaffold_counts = compute_scaffolds(mols, n_jobs=self.n_jobs)
+        return {'scaf': scaffold_counts}
+    def metric(self, pref, pgen):
+        ref_counts, gen_counts = pref['scaf'], pgen['scaf']
+        return cos_similarity(ref_counts, gen_counts)
+class WassersteinMetric(Metric):
+    """
+    Wasserstein distance between two value distributions. When func is
+    given it is mapped over the molecules to obtain the values;
+    otherwise the inputs are used as the values directly.
+    """
+    def __init__(self, func=None, **kwargs):
+        self.func = func
+        super().__init__(**kwargs)
+    def precalc(self, mols):
+        if self.func is None:
+            return {'values': mols}
+        return {'values': mapper(self.n_jobs)(self.func, mols)}
+    def metric(self, pref, pgen):
+        ref_values = pref['values']
+        gen_values = pgen['values']
+        return wasserstein_distance(ref_values, gen_values)

molgen_metric.py CHANGED Viewed

@@ -94,8 +94,6 @@ class molgen_metric(evaluate.Measurement):
     def _compute(self, generated_smiles, train_smiles = None):
         Results = metrics.get_all_metrics(gen = generated_smiles, train= train_smiles)
         generated_smiles = [s for s in generated_smiles if s != '']

     def _compute(self, generated_smiles, train_smiles = None):
         Results = metrics.get_all_metrics(gen = generated_smiles, train= train_smiles)
         generated_smiles = [s for s in generated_smiles if s != '']

utils.py ADDED Viewed

	@@ -0,0 +1,316 @@

+import random
+from multiprocessing import Pool
+from collections import UserList, defaultdict
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+import torch
+from rdkit import rdBase
+from rdkit import Chem
+# https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
+def set_torch_seed_to_all_gens(_):
+    """
+    Seeds Python's random and NumPy's global generator from torch's
+    initial seed (reduced modulo 2**32 - 1). The argument is ignored.
+    """
+    derived_seed = torch.initial_seed() % (2**32 - 1)
+    for seed_fn in (random.seed, np.random.seed):
+        seed_fn(derived_seed)
+class SpecialTokens:
+    """Marker strings that CharVocab adds after the regular characters."""
+    # Names suggest begin-of-sequence, end-of-sequence, padding and
+    # unknown-character markers; CharVocab returns the unk id for
+    # characters it does not know.
+    bos = '<bos>'
+    eos = '<eos>'
+    pad = '<pad>'
+    unk = '<unk>'
+class CharVocab:
+    """
+    Character-level vocabulary: maps every character (plus the four
+    special tokens) to an integer id and back.
+    """
+    @classmethod
+    def from_data(cls, data, *args, **kwargs):
+        """Builds a vocabulary from every character seen in data."""
+        chars = set()
+        for string in data:
+            chars.update(string)
+        return cls(chars, *args, **kwargs)
+    def __init__(self, chars, ss=SpecialTokens):
+        """
+        Parameters:
+            chars: iterable of characters
+            ss: object providing the bos/eos/pad/unk marker strings
+        Raises:
+            ValueError: if chars already contains one of the markers
+        """
+        if (ss.bos in chars) or (ss.eos in chars) or \
+                (ss.pad in chars) or (ss.unk in chars):
+            raise ValueError('SpecialTokens in chars')
+        # Regular characters come first in sorted order, then the
+        # special tokens take the last four ids.
+        all_syms = sorted(list(chars)) + [ss.bos, ss.eos, ss.pad, ss.unk]
+        self.ss = ss
+        self.c2i = {c: i for i, c in enumerate(all_syms)}
+        self.i2c = {i: c for i, c in enumerate(all_syms)}
+    def __len__(self):
+        return len(self.c2i)
+    @property
+    def bos(self):
+        return self.c2i[self.ss.bos]
+    @property
+    def eos(self):
+        return self.c2i[self.ss.eos]
+    @property
+    def pad(self):
+        return self.c2i[self.ss.pad]
+    @property
+    def unk(self):
+        return self.c2i[self.ss.unk]
+    def char2id(self, char):
+        """Returns the id of char, or the unk id if it is not known."""
+        if char not in self.c2i:
+            return self.unk
+        return self.c2i[char]
+    def id2char(self, id):
+        """Returns the symbol for id, or the unk marker if it is not known."""
+        if id not in self.i2c:
+            return self.ss.unk
+        return self.i2c[id]
+    def string2ids(self, string, add_bos=False, add_eos=False):
+        """Encodes string as a list of ids, optionally wrapped in BOS/EOS."""
+        ids = [self.char2id(c) for c in string]
+        if add_bos:
+            ids = [self.bos] + ids
+        if add_eos:
+            ids = ids + [self.eos]
+        return ids
+    def ids2string(self, ids, rem_bos=True, rem_eos=True):
+        """
+        Decodes ids back into a string, optionally dropping a leading
+        BOS and a trailing EOS. An empty sequence decodes to ''.
+        """
+        if len(ids) == 0:
+            return ''
+        if rem_bos and ids[0] == self.bos:
+            ids = ids[1:]
+        # Length is re-checked: stripping BOS from a lone [bos] leaves an
+        # empty sequence, and ids[-1] would raise IndexError on it.
+        if rem_eos and len(ids) > 0 and ids[-1] == self.eos:
+            ids = ids[:-1]
+        string = ''.join([self.id2char(id) for id in ids])
+        return string
+class OneHotVocab(CharVocab):
+    """CharVocab that also keeps an identity matrix with one row per symbol."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        n_symbols = len(self.c2i)
+        self.vectors = torch.eye(n_symbols)
+def mapper(n_jobs):
+    '''
+    Returns function for map call.
+    If n_jobs == 1, will use standard map
+    If n_jobs > 1, will use multiprocessing pool
+    If n_jobs is a pool object, will return its map function
+    '''
+    if n_jobs == 1:
+        def _mapper(*args, **kwargs):
+            return list(map(*args, **kwargs))
+        return _mapper
+    if isinstance(n_jobs, int):
+        def _mapper(*args, **kwargs):
+            # The pool is created per call. Creating it once up front
+            # leaked the workers when the returned function was never
+            # called, and left a terminated pool behind for any second
+            # call. Leaving the with-block terminates the pool.
+            with Pool(n_jobs) as pool:
+                return pool.map(*args, **kwargs)
+        return _mapper
+    return n_jobs.map
+class Logger(UserList):
+    """
+    List of per-step dicts that additionally keeps, for every key, the
+    list of values seen so far, so a whole series can be read by key.
+    """
+    def __init__(self, data=None):
+        super().__init__()
+        self.sdata = defaultdict(list)
+        initial_steps = data or []
+        for entry in initial_steps:
+            self.append(entry)
+    def __getitem__(self, key):
+        # Slices and ints index the steps; anything else is a series key.
+        if isinstance(key, slice):
+            return Logger(self.data[key])
+        if isinstance(key, int):
+            return self.data[key]
+        series = self.sdata[key]
+        # A series of dicts is wrapped so it can be indexed by key again.
+        return Logger(series) if isinstance(series[0], dict) else series
+    def append(self, step_dict):
+        super().append(step_dict)
+        for name, value in step_dict.items():
+            self.sdata[name].append(value)
+    def save(self, path):
+        """Writes all steps to path as CSV."""
+        frame = pd.DataFrame(list(self))
+        frame.to_csv(path, index=None)
+class LogPlotter:
+    """Draws line plots of the series stored in a log."""
+    def __init__(self, log):
+        self.log = log
+    def line(self, ax, name):
+        """
+        Plots the series called name on ax. If the first step stores a
+        dict under name, one labelled line is drawn per key of it.
+        """
+        if isinstance(self.log[0][name], dict):
+            for k in self.log[0][name]:
+                ax.plot(self.log[name][k], label=k)
+            ax.legend()
+        else:
+            ax.plot(self.log[name])
+        ax.set_ylabel('value')
+        ax.set_xlabel('epoch')
+        ax.set_title(name)
+    def grid(self, names, size=7):
+        """
+        Plots every series in names on a two-column grid of axes.
+        Parameters:
+            names: series names to plot; nothing is drawn when empty
+            size: side length of each subplot
+        """
+        if len(names) == 0:
+            return
+        # Round up: with floor division an odd number of names lost its
+        # last plot, and a single name asked for zero rows.
+        n_rows = (len(names) + 1) // 2
+        _, axs = plt.subplots(nrows=n_rows, ncols=2,
+                              figsize=(size * 2, size * n_rows))
+        axes = axs.flatten()
+        for ax, name in zip(axes, names):
+            self.line(ax, name)
+        # Hide the spare cell left over by an odd number of names.
+        for ax in axes[len(names):]:
+            ax.set_visible(False)
+class CircularBuffer:
+    """
+    Fixed-capacity numeric buffer that overwrites its oldest slot once
+    full and can report the latest element and the mean of its contents.
+    """
+    def __init__(self, size):
+        self.max_size = size
+        self.data = np.zeros(self.max_size)
+        self.size = 0
+        # -1 marks "nothing written yet".
+        self.pointer = -1
+    def add(self, element):
+        """Stores element in the next slot (wrapping around) and returns it."""
+        slot = (self.pointer + 1) % self.max_size
+        self.data[slot] = element
+        self.pointer = slot
+        self.size = min(self.size + 1, self.max_size)
+        return element
+    def last(self):
+        """Returns the most recently added element."""
+        assert self.pointer != -1, "Can't get an element from an empty buffer!"
+        return self.data[self.pointer]
+    def mean(self):
+        """Returns the mean of the stored elements, or 0.0 when empty."""
+        if self.size <= 0:
+            return 0.0
+        filled = self.data[:self.size]
+        return filled.mean()
+def disable_rdkit_log():
+    """Disables every RDKit log channel matching 'rdApp.*'."""
+    rdBase.DisableLog('rdApp.*')
+def enable_rdkit_log():
+    """Enables every RDKit log channel matching 'rdApp.*'."""
+    rdBase.EnableLog('rdApp.*')
+def get_mol(smiles_or_mol):
+    '''
+    Turns a SMILES string into an RDKit molecule. Anything that is not
+    a string is returned untouched. Returns None for an empty string,
+    for SMILES RDKit cannot parse, and when sanitization raises
+    ValueError.
+    '''
+    if not isinstance(smiles_or_mol, str):
+        return smiles_or_mol
+    if not smiles_or_mol:
+        return None
+    parsed = Chem.MolFromSmiles(smiles_or_mol)
+    if parsed is None:
+        return None
+    try:
+        Chem.SanitizeMol(parsed)
+    except ValueError:
+        return None
+    return parsed
+class StringDataset:
+    """Dataset of SMILES strings tokenized with a vocabulary."""
+    def __init__(self, vocab, data):
+        """
+        Tokenizes every string up front.
+        Arguments:
+            vocab: CharVocab instance for tokenization
+            data (list): SMILES strings for the dataset
+        """
+        self.vocab = vocab
+        self.tokens = [vocab.string2ids(smiles) for smiles in data]
+        self.data = data
+        self.bos = vocab.bos
+        self.eos = vocab.eos
+    def __len__(self):
+        """
+        Number of strings in the dataset
+        """
+        return len(self.tokens)
+    def __getitem__(self, index):
+        """
+        Builds the tensors for one SMILES.
+        Arguments:
+            index (int): index of SMILES in the original dataset
+        Returns:
+            A tuple (with_bos, with_eos, smiles), where
+            * with_bos is a torch.long tensor of the tokens with the
+                BOS token prepended
+            * with_eos is a torch.long tensor of the tokens with the
+                EOS token appended
+            * smiles is the original SMILES from the dataset
+        """
+        ids = self.tokens[index]
+        return (
+            torch.tensor([self.bos] + ids, dtype=torch.long),
+            torch.tensor(ids + [self.eos], dtype=torch.long),
+            self.data[index],
+        )
+    def default_collate(self, batch, return_data=False):
+        """
+        Collate function: joins items produced by __getitem__ into one
+        padded batch. Padding uses self.vocab.pad.
+        Arguments:
+            batch: list of objects from StringDataset
+            return_data: if True, will return SMILES used in a batch
+        Returns:
+            with_bos, with_eos, lengths [, data] where
+            * with_bos: padded sequence with BOS in the beginning
+            * with_eos: padded sequence with EOS in the end
+            * lengths: list with sequence lengths in the batch
+            * data: SMILES in the batch
+        Note: the batch is ordered by decreasing sequence length
+        """
+        bos_seqs, eos_seqs, smiles = list(zip(*batch))
+        seq_lens = [len(seq) for seq in bos_seqs]
+        # Longest first.
+        order = np.argsort(seq_lens)[::-1]
+        def _sorted_padded(seqs):
+            return torch.nn.utils.rnn.pad_sequence(
+                [seqs[i] for i in order], padding_value=self.vocab.pad
+            )
+        with_bos = _sorted_padded(bos_seqs)
+        with_eos = _sorted_padded(eos_seqs)
+        lengths = [seq_lens[i] for i in order]
+        if not return_data:
+            return with_bos, with_eos, lengths
+        return with_bos, with_eos, lengths, np.array(smiles)[order]
+def batch_to_device(batch, device):
+    """
+    Returns a new list in which every torch.Tensor of batch has been
+    moved to device; other items are passed through unchanged.
+    """
+    moved = []
+    for item in batch:
+        if isinstance(item, torch.Tensor):
+            item = item.to(device)
+        moved.append(item)
+    return moved

utils2.py ADDED Viewed

	@@ -0,0 +1,271 @@

+import os
+from collections import Counter
+from functools import partial
+import numpy as np
+import pandas as pd
+import scipy.sparse
+import torch
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit.Chem import MACCSkeys
+from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect as Morgan
+from rdkit.Chem.QED import qed
+from rdkit.Chem.Scaffolds import MurckoScaffold
+from rdkit.Chem import Descriptors
+from moses.metrics.SA_Score import sascorer
+from moses.metrics.NP_Score import npscorer
+from moses.utils import mapper, get_mol
+# Substructure filters are loaded once at import time from two CSV files
+# that must sit next to this module.
+_base_dir = os.path.split(__file__)[0]
+_mcf = pd.read_csv(os.path.join(_base_dir, 'mcf.csv'))
+_pains = pd.read_csv(os.path.join(_base_dir, 'wehi_pains.csv'),
+                     names=['smarts', 'names'])
+# DataFrame.append was removed in pandas 2.0; pd.concat gives the same
+# row-wise union with sorted columns.
+_filters = [Chem.MolFromSmarts(x) for x in
+            pd.concat([_mcf, _pains], sort=True)['smarts'].values]
+def canonic_smiles(smiles_or_mol):
+    """
+    Returns the SMILES RDKit writes for the given SMILES or molecule,
+    or None when get_mol yields no molecule.
+    """
+    parsed = get_mol(smiles_or_mol)
+    return None if parsed is None else Chem.MolToSmiles(parsed)
+def logP(mol):
+    """
+    Computes RDKit's logP
+    Returns the result of Chem.Crippen.MolLogP for mol.
+    """
+    return Chem.Crippen.MolLogP(mol)
+def SA(mol):
+    """
+    Computes the Synthetic Accessibility score
+    Returns the result of sascorer.calculateScore for mol.
+    """
+    return sascorer.calculateScore(mol)
+def NP(mol):
+    """
+    Computes the Natural Product-likeness score
+    Returns the result of npscorer.scoreMol for mol.
+    """
+    return npscorer.scoreMol(mol)
+def QED(mol):
+    """
+    Computes RDKit's QED score
+    Returns the result of rdkit.Chem.QED.qed for mol.
+    """
+    return qed(mol)
+def weight(mol):
+    """
+    Computes molecular weight for given molecule.
+    Returns the result of Descriptors.MolWt for mol.
+    """
+    return Descriptors.MolWt(mol)
+def get_n_rings(mol):
+    """
+    Computes the number of rings in a molecule
+    Returns NumRings() of the molecule's ring info.
+    """
+    return mol.GetRingInfo().NumRings()
+def fragmenter(mol):
+    """
+    Splits mol on its BRICS bonds and returns the SMILES of the
+    resulting pieces as a list
+    """
+    broken = AllChem.FragmentOnBRICSBonds(get_mol(mol))
+    return Chem.MolToSmiles(broken).split(".")
+def compute_fragments(mol_list, n_jobs=1):
+    """
+    Fragments every molecule with fragmenter and returns a Counter of
+    fragment SMILES over the whole list
+    """
+    per_molecule = mapper(n_jobs)(fragmenter, mol_list)
+    return Counter(
+        fragment for fragment_list in per_molecule
+        for fragment in fragment_list
+    )
+def compute_scaffolds(mol_list, n_jobs=1, min_rings=2):
+    """
+    Computes the scaffold of every molecule and returns a Counter of
+    scaffold SMILES.
+    Parameters:
+        mol_list: molecules (or SMILES) to process
+        n_jobs: worker count or pool, handed to mapper
+        min_rings: passed to compute_scaffold, which returns None for
+            scaffolds with fewer rings
+    """
+    map_ = mapper(n_jobs)
+    scaffolds = Counter(
+        map_(partial(compute_scaffold, min_rings=min_rings), mol_list))
+    # compute_scaffold reports rejected molecules as None; those are not
+    # scaffolds and must not be counted.
+    scaffolds.pop(None, None)
+    return scaffolds
+def compute_scaffold(mol, min_rings=2):
+    """
+    Returns the Murcko scaffold of mol as SMILES, or None when the
+    scaffold cannot be computed, is empty, or has fewer than min_rings
+    rings.
+    """
+    molecule = get_mol(mol)
+    try:
+        core = MurckoScaffold.GetScaffoldForMol(molecule)
+    except (ValueError, RuntimeError):
+        return None
+    ring_count = get_n_rings(core)
+    core_smiles = Chem.MolToSmiles(core)
+    rejected = core_smiles == '' or ring_count < min_rings
+    return None if rejected else core_smiles
+def average_agg_tanimoto(stock_vecs, gen_vecs,
+                         batch_size=5000, agg='max',
+                         device='cpu', p=1):
+    """
+    For every vector in gen_vecs aggregates (max or mean) its Tanimoto
+    similarity to all vectors in stock_vecs, then returns the average
+    of these aggregates over gen_vecs.
+    Parameters:
+        stock_vecs: numpy array <n_vectors x dim>
+        gen_vecs: numpy array <n_vectors' x dim>
+        batch_size: number of rows of each array processed at once
+        agg: max or mean
+        device: torch device used for the matrix products
+        p: power for averaging: (mean x^p)^(1/p)
+    """
+    assert agg in ['max', 'mean'], "Can aggregate only max or mean"
+    n_gen = len(gen_vecs)
+    scores = np.zeros(n_gen)
+    counts = np.zeros(n_gen)
+    use_power = p != 1
+    for s_start in range(0, stock_vecs.shape[0], batch_size):
+        stock = torch.tensor(
+            stock_vecs[s_start:s_start + batch_size]).to(device).float()
+        stock_bits = stock.sum(1, keepdim=True)
+        for g_start in range(0, gen_vecs.shape[0], batch_size):
+            gen_t = torch.tensor(
+                gen_vecs[g_start:g_start + batch_size]).to(device).float()
+            gen_t = gen_t.transpose(0, 1)
+            common = torch.mm(stock, gen_t)
+            # Tanimoto = |A & B| / (|A| + |B| - |A & B|)
+            union = stock_bits + gen_t.sum(0, keepdim=True) - common
+            sim = (common / union).cpu().numpy()
+            # 0/0 (two all-zero vectors) is treated as identical.
+            sim[np.isnan(sim)] = 1
+            if use_power:
+                sim = sim**p
+            g_stop = g_start + gen_t.shape[1]
+            if agg == 'max':
+                scores[g_start:g_stop] = np.maximum(
+                    scores[g_start:g_stop], sim.max(0))
+            elif agg == 'mean':
+                scores[g_start:g_stop] += sim.sum(0)
+                counts[g_start:g_stop] += sim.shape[0]
+    if agg == 'mean':
+        scores /= counts
+    if use_power:
+        scores = (scores)**(1/p)
+    return np.mean(scores)
+def fingerprint(smiles_or_mol, fp_type='maccs', dtype=None, morgan__r=2,
+                morgan__n=1024, *args, **kwargs):
+    """
+    Builds the fingerprint of one SMILES / molecule as a numpy array of
+    bits. Returns None when get_mol yields no molecule.
+    Parameters:
+        smiles_or_mol: SMILES string or molecule
+        fp_type: type of fingerprint: [MACCS|morgan] (case-insensitive)
+        dtype: if not None, specifies the dtype of returned array
+        morgan__r: radius passed to the Morgan fingerprint
+        morgan__n: number of bits of the Morgan fingerprint
+    Raises:
+        ValueError: for an unknown fp_type
+    """
+    kind = fp_type.lower()
+    molecule = get_mol(smiles_or_mol, *args, **kwargs)
+    if molecule is None:
+        return None
+    if kind == 'maccs':
+        on_bits = np.array(MACCSkeys.GenMACCSKeys(molecule).GetOnBits())
+        bits = np.zeros(166, dtype='uint8')
+        if len(on_bits) != 0:
+            bits[on_bits - 1] = 1  # We drop 0-th key that is always zero
+    elif kind == 'morgan':
+        bits = np.asarray(Morgan(molecule, morgan__r, nBits=morgan__n),
+                          dtype='uint8')
+    else:
+        raise ValueError("Unknown fingerprint type {}".format(kind))
+    return bits if dtype is None else bits.astype(dtype)
+def fingerprints(smiles_mols_array, n_jobs=1, already_unique=False, *args,
+                 **kwargs):
+    '''
+    Computes fingerprints of smiles np.array/list/pd.Series with n_jobs workers
+    e.g.fingerprints(smiles_mols_array, type='morgan', n_jobs=10)
+    Inserts np.nan to rows corresponding to incorrect smiles.
+    IMPORTANT: if there is at least one np.nan, the dtype would be float
+    Parameters:
+        smiles_mols_array: list/array/pd.Series of smiles or already computed
+            RDKit molecules
+        n_jobs: number of parallel workers to execute
+        already_unique: flag for performance reasons, if smiles array is big
+            and already unique. Its value is set to True if smiles_mols_array
+            contain RDKit molecules already.
+    Returns:
+        2-D array (or CSR matrix when the fingerprints are sparse) with
+        one row per input entry, in input order
+    '''
+    if isinstance(smiles_mols_array, pd.Series):
+        smiles_mols_array = smiles_mols_array.values
+    else:
+        smiles_mols_array = np.asarray(smiles_mols_array)
+    if not isinstance(smiles_mols_array[0], str):
+        already_unique = True
+    if not already_unique:
+        # Fingerprint each distinct SMILES once; inv_index restores the
+        # original order and multiplicity at the end.
+        smiles_mols_array, inv_index = np.unique(smiles_mols_array,
+                                                 return_inverse=True)
+    fps = mapper(n_jobs)(
+        partial(fingerprint, *args, **kwargs), smiles_mols_array
+    )
+    # The first valid fingerprint fixes the row width. first_fp stays
+    # None (instead of unbound) when every entry failed.
+    first_fp = next((fp for fp in fps if fp is not None), None)
+    length = 1 if first_fp is None else first_fp.shape[-1]
+    # np.nan: the np.NaN alias was removed in NumPy 2.0.
+    nan_row = np.full((1, length), np.nan)
+    fps = [fp if fp is not None else nan_row for fp in fps]
+    if scipy.sparse.issparse(first_fp):
+        fps = scipy.sparse.vstack(fps).tocsr()
+    else:
+        fps = np.vstack(fps)
+    if not already_unique:
+        return fps[inv_index]
+    return fps
+def mol_passes_filters(mol,
+                       allowed=None,
+                       isomericSmiles=False):
+    """
+    Returns True only if mol
+    * can be loaded by get_mol,
+    * has no ring of 8 or more atoms,
+    * carries no formal charges,
+    * consists of allowed atoms only,
+    * matches none of the module-level substructure filters
+      (checked on the molecule with explicit hydrogens), and
+    * round-trips through a non-empty SMILES
+    """
+    allowed = allowed or {'C', 'N', 'S', 'O', 'F', 'Cl', 'Br', 'H'}
+    mol = get_mol(mol)
+    if mol is None:
+        return False
+    rings = mol.GetRingInfo()
+    has_big_ring = rings.NumRings() != 0 and any(
+        len(ring) >= 8 for ring in rings.AtomRings()
+    )
+    if has_big_ring:
+        return False
+    with_hs = Chem.AddHs(mol)
+    for atom in mol.GetAtoms():
+        if atom.GetFormalCharge() != 0:
+            return False
+    for atom in mol.GetAtoms():
+        if atom.GetSymbol() not in allowed:
+            return False
+    for pattern in _filters:
+        if with_hs.HasSubstructMatch(pattern):
+            return False
+    smiles = Chem.MolToSmiles(mol, isomericSmiles=isomericSmiles)
+    if smiles is None or len(smiles) == 0:
+        return False
+    return Chem.MolFromSmiles(smiles) is not None