DeepSEQreen_fast_build

Running on CPU Upgrade

App Files Files Community

libokj committed on Dec 22, 2023

Commit

c0ec7e6

1 Parent(s): f386218

Upload 110 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

deepscreen/__init__.py +101 -0
deepscreen/__pycache__/__init__.cpython-311.pyc +0 -0
deepscreen/__pycache__/predict.cpython-311.pyc +0 -0
deepscreen/data/__init__.py +0 -0
deepscreen/data/__pycache__/__init__.cpython-311.pyc +0 -0
deepscreen/data/__pycache__/dti.cpython-311.pyc +0 -0
deepscreen/data/dti.py +422 -0
deepscreen/data/dti.py.bak +369 -0
deepscreen/data/dti_datamodule.py +314 -0
deepscreen/data/entity_datamodule.py +167 -0
deepscreen/data/featurizers/__init__.py +0 -0
deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc +0 -0
deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc +0 -0
deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc +0 -0
deepscreen/data/featurizers/categorical.py +86 -0
deepscreen/data/featurizers/chem.py +48 -0
deepscreen/data/featurizers/fcs.py +67 -0
deepscreen/data/featurizers/fingerprint/__init__.py +45 -0
deepscreen/data/featurizers/fingerprint/atompairs.py +18 -0
deepscreen/data/featurizers/fingerprint/avalonfp.py +16 -0
deepscreen/data/featurizers/fingerprint/estatefp.py +12 -0
deepscreen/data/featurizers/fingerprint/maccskeys.py +25 -0
deepscreen/data/featurizers/fingerprint/maccskeys.xlsx +0 -0
deepscreen/data/featurizers/fingerprint/map4.py +130 -0
deepscreen/data/featurizers/fingerprint/mhfp6.py +18 -0
deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef +53 -0
deepscreen/data/featurizers/fingerprint/morganfp.py +18 -0
deepscreen/data/featurizers/fingerprint/pharmErGfp.py +60 -0
deepscreen/data/featurizers/fingerprint/pharmPointfp.py +59 -0
deepscreen/data/featurizers/fingerprint/pubchemfp.py +1731 -0
deepscreen/data/featurizers/fingerprint/pubchemfp.xlsx +0 -0
deepscreen/data/featurizers/fingerprint/rdkitfp.py +42 -0
deepscreen/data/featurizers/fingerprint/smarts_maccskey.py +178 -0
deepscreen/data/featurizers/fingerprint/smarts_pharmacophore.py +21 -0
deepscreen/data/featurizers/fingerprint/smarts_pubchem.py +734 -0
deepscreen/data/featurizers/fingerprint/torsions.py +18 -0
deepscreen/data/featurizers/graph.py +133 -0
deepscreen/data/featurizers/monn.py +106 -0
deepscreen/data/featurizers/token.py +299 -0
deepscreen/data/single_entity.py +195 -0
deepscreen/data/utils/__init__.py +8 -0
deepscreen/data/utils/__pycache__/__init__.cpython-311.pyc +0 -0
deepscreen/data/utils/__pycache__/collator.cpython-311.pyc +0 -0
deepscreen/data/utils/__pycache__/label.cpython-311.pyc +0 -0
deepscreen/data/utils/__pycache__/sampler.cpython-311.pyc +0 -0
deepscreen/data/utils/__pycache__/split.cpython-311.pyc +0 -0
deepscreen/data/utils/collator.py +168 -0
deepscreen/data/utils/dataset.py +216 -0
deepscreen/data/utils/label.py +93 -0
deepscreen/data/utils/sampler.py +90 -0

deepscreen/__init__.py ADDED Viewed

	@@ -0,0 +1,101 @@

+"""
+DeepScreen package initialization, registering custom objects and monkey patching for some libraries.
+"""
+import sys
+from builtins import eval
+import lightning.fabric.strategies.launchers.subprocess_script as subprocess_script
+import torch
+from omegaconf import OmegaConf
+from deepscreen.utils import get_logger
+log = get_logger(__name__)
+# Allow basic Python operations in hydra interpolation; examples:
+# `in_channels: ${eval:${model.drug_encoder.out_channels}+${model.protein_encoder.out_channels}}`
+# `subdir: ${eval:${hydra.job.override_dirname}.replace('/', '.')}`
+# NOTE(review): the resolver registered here is the builtin `eval`, so whatever string reaches an
+# `${eval:...}` interpolation is executed as Python code. Only load configs and overrides from trusted sources.
+OmegaConf.register_new_resolver("eval", eval)
+def sanitize_path(path_str: str):
+    """
+    Sanitize a string for path creation by replacing unsafe characters and cutting length to 255 (OS limitation).
+    """
+    return path_str.replace("/", ".").replace("\\", ".").replace(":", "-")[:255]
+OmegaConf.register_new_resolver("sanitize_path", sanitize_path)
+def _hydra_subprocess_cmd(local_rank: int):
+    """
+    Monkey patching for lightning.fabric.strategies.launchers.subprocess_script._hydra_subprocess_cmd
+    Temporarily fixes the problem of unnecessarily creating log folders for DDP subprocesses in Hydra multirun/sweep.
+    """
+    import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
+    from hydra.core.hydra_config import HydraConfig
+    from hydra.utils import get_original_cwd, to_absolute_path
+    # when user is using hydra find the absolute path
+    if __main__.__spec__ is None:  # pragma: no-cover
+        command = [sys.executable, to_absolute_path(sys.argv[0])]
+    else:
+        command = [sys.executable, "-m", __main__.__spec__.name]
+    command += sys.argv[1:]
+    cwd = get_original_cwd()
+    rundir = f'"{HydraConfig.get().runtime.output_dir}"'
+    # Set output_subdir null since we don't want different subprocesses trying to write to config.yaml
+    command += [f"hydra.job.name=train_ddp_process_{local_rank}",
+                "hydra.output_subdir=null,"
+                f"hydra.runtime.output_dir={rundir}"]
+    return command, cwd
+subprocess_script._hydra_subprocess_cmd = _hydra_subprocess_cmd
+# from torch import Tensor
+# from lightning.fabric.utilities.distributed import _distributed_available
+# from lightning.pytorch.utilities.rank_zero import WarningCache
+# from lightning.pytorch.utilities.warnings import PossibleUserWarning
+# from lightning.pytorch.trainer.connectors.logger_connector.result import _ResultCollection
+# warning_cache = WarningCache()
+#
+# @staticmethod
+# def _get_cache(result_metric, on_step: bool):
+#     cache = None
+#     if on_step and result_metric.meta.on_step:
+#         cache = result_metric._forward_cache
+#     elif not on_step and result_metric.meta.on_epoch:
+#         if result_metric._computed is None:
+#             should = result_metric.meta.sync.should
+#             if not should and _distributed_available() and result_metric.is_tensor:
+#                 warning_cache.warn(
+#                     f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)`"
+#                     " when logging on epoch level in distributed setting to accumulate the metric across"
+#                     " devices.",
+#                     category=PossibleUserWarning,
+#                 )
+#             result_metric.compute()
+#             result_metric.meta.sync.should = should
+#
+#         cache = result_metric._computed
+#
+#         if cache is not None:
+#             if isinstance(cache, Tensor):
+#                 if not result_metric.meta.enable_graph:
+#                     return cache.detach()
+#
+#     return cache
+#
+#
+# _ResultCollection._get_cache = _get_cache
+# Opt into "high" float32 matmul precision when CUDA is available and the reported compute capability is
+# at least 8.0.
+# NOTE(review): `get_device_capability()` is called without a device argument, so presumably only the current
+# device is inspected; confirm this is acceptable on hosts with mixed GPUs.
+if torch.cuda.is_available():
+    if torch.cuda.get_device_capability() >= (8, 0):
+        torch.set_float32_matmul_precision("high")
+        log.info("Your GPU supports tensor cores, "
+                 "we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`")

deepscreen/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (3.28 kB). View file

deepscreen/__pycache__/predict.cpython-311.pyc ADDED Viewed

Binary file (3.38 kB). View file

deepscreen/data/__init__.py ADDED Viewed

File without changes

deepscreen/data/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (179 Bytes). View file

deepscreen/data/__pycache__/dti.cpython-311.pyc ADDED Viewed

Binary file (23 kB). View file

deepscreen/data/dti.py ADDED Viewed

	@@ -0,0 +1,422 @@

+import re
+from functools import partial
+from numbers import Number
+from pathlib import Path
+from typing import Any, Dict, Optional, Sequence, Union, Literal
+from lightning import LightningDataModule
+import pandas as pd
+import swifter
+from sklearn.preprocessing import LabelEncoder
+from torch.utils.data import Dataset, DataLoader
+from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
+from deepscreen.utils import get_logger
+log = get_logger(__name__)
+SMILES_PAT = r"[^A-Za-z0-9=#:+\-\[\]<>()/\\@%,.*]"
+FASTA_PAT = r"[^A-Z*\-]"
+def validate_seq_str(seq, regex):
+    if seq:
+        err_charset = set(re.findall(regex, seq))
+        if not err_charset:
+            return None
+        else:
+            return ', '.join(err_charset)
+    else:
+        return 'Empty string'
+# TODO: save a list of corrupted records
+def rdkit_canonicalize(smiles):
+    from rdkit import Chem
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        cano_smiles = Chem.MolToSmiles(mol)
+        return cano_smiles
+    except Exception as e:
+        log.warning(f'Failed to canonicalize SMILES using RDKIT due to {str(e)}. Returning original SMILES: {smiles}')
+        return smiles
+class DTIDataset(Dataset):
+    def __init__(
+            self,
+            task: Literal['regression', 'binary', 'multiclass'],
+            num_classes: Optional[int],
+            data_path: str | Path,
+            drug_featurizer: callable,
+            protein_featurizer: callable,
+            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
+            discard_intermediate: Optional[bool] = False,
+            query: Optional[str] = 'X2'
+    ):
+        df = pd.read_csv(
+            data_path,
+            engine='python',
+            header=0,
+            usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
+            dtype={
+                'X1': 'str',
+                'ID1': 'str',
+                'X2': 'str',
+                'ID2': 'str',
+                'Y': 'float32',
+                'U': 'str',
+            },
+        )
+        # Read the whole data table
+        # if 'ID1' in df:
+        #     self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
+        # if 'ID2' in df:
+        #     self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
+        #     self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
+        # self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
+        # # train and eval mode data processing (fully labelled)
+        # if 'Y' in df.columns and df['Y'].notnull().all():
+        log.info(f"Processing data file: {data_path}")
+        # Forward-fill all non-label columns
+        df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
+        # TODO potentially allow running through the whole data validation process
+        # error = False
+        if 'Y' in df:
+            log.info(f"Validating labels (`Y`)...")
+            # TODO: check sklearn.utils.multiclass.check_classification_targets
+            match task:
+                case 'regression':
+                    assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
+                        f"""`Y` must be numeric for `regression` task,
+                        but it has {set(df['Y'].swifter.apply(type))}."""
+                case 'binary':
+                    if all(df['Y'].isin([0, 1])):
+                        assert not thresholds, \
+                            f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
+                            but still got `thresholds` ({thresholds}).
+                            Double check your choices of `task` and `thresholds`, and records in the `Y` column."""
+                    else:
+                        assert thresholds, \
+                            f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
+                            but it has {pd.unique(df['Y'])}.
+                            You may set `thresholds` to discretize continuous labels."""  # TODO print err idx instead
+                case 'multiclass':
+                    assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
+                    if all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)):
+                        assert not thresholds, \
+                            f"""`Y` is already non-negative integers for
+                            `multiclass` (classification) `task`, but still got `thresholds` ({thresholds}).
+                            Double check your choice of `task`, `thresholds` and records in the `Y` column."""
+                    else:
+                        assert thresholds, \
+                            f"""`Y` must be non-negative integers for
+                            `multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
+                            You must set `thresholds` to discretize continuous labels."""  # TODO print err idx instead
+            if 'U' in df.columns:
+                units = df['U']
+            else:
+                units = None
+                log.warning("Units ('U') not in the data table. "
+                            "Assuming all labels to be discrete or in p-scale (-log10[M]).")
+            # Transform labels
+            df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
+                                      discard_intermediate=discard_intermediate)
+            # Filter out rows with a NaN in Y (missing values)
+            df.dropna(subset=['Y'], inplace=True)
+            match task:
+                case 'regression':
+                    df['Y'] = df['Y'].astype('float32')
+                    assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
+                        f"""`Y` must be numeric for `regression` task,
+                        but after transformation it still has {set(df['Y'].swifter.apply(type))}.
+                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                    # TODO print err idx instead
+                case 'binary':
+                    df['Y'] = df['Y'].astype('int')
+                    assert all(df['Y'].isin([0, 1])), \
+                        f"""`Y` must be 0 or 1 for `task=binary`, "
+                        but after transformation it still has {pd.unique(df['Y'])}.
+                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                    # TODO print err idx instead
+                case 'multiclass':
+                    df['Y'] = df['Y'].astype('int')
+                    assert all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)), \
+                        f"""Y must be non-negative integers for `task=multiclass`
+                        but after transformation it still has {pd.unique(df['Y'])}.
+                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                    # TODO print err idx instead
+                    target_n_unique = df['Y'].nunique()
+                    assert target_n_unique == num_classes, \
+                        f"""You have set `num_classes` for `task=multiclass` to {num_classes},
+                        but after transformation Y still has {target_n_unique} unique labels.
+                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+        log.info("Validating SMILES (`X1`)...")
+        df['X1_ERR'] = df['X1'].swifter.progress_bar(
+            desc="Validating SMILES...").apply(validate_seq_str, regex=SMILES_PAT)
+        if not df['X1_ERR'].isna().all():
+            raise Exception(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+        df['X1^'] = df['X1'].apply(rdkit_canonicalize)  # swifter
+        log.info("Validating FASTA (`X2`)...")
+        df['X2'] = df['X2'].str.upper()
+        df['X2_ERR'] = df['X2'].swifter.progress_bar(
+            desc="Validating FASTA...").apply(validate_seq_str, regex=FASTA_PAT)
+        if not df['X2_ERR'].isna().all():
+            raise Exception(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
+        # FASTA/SMILES indices as query for retrieval metrics like enrichment factor and hit rate
+        if query:
+            df['ID^'] = LabelEncoder().fit_transform(df[query])
+        self.df = df
+        self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
+        self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
+    def __len__(self):
+        return len(self.df.index)
+    def __getitem__(self, i):
+        sample = self.df.loc[i]
+        return {
+            'N': i,
+            'X1': sample['X1'],
+            'X1^': self.drug_featurizer(sample['X1^']),
+            'ID1': sample.get('ID1'),
+            'X2': sample['X2'],
+            'X2^': self.protein_featurizer(sample['X2']),
+            'ID2': sample.get('ID2'),
+            'Y': sample.get('Y'),
+            'ID^': sample.get('ID^'),
+        }
+class DTIDataModule(LightningDataModule):
+    """
+    DTI DataModule
+    A DataModule implements 5 key methods:
+        def prepare_data(self):
+            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
+            # download data, pre-process, split, save to disk, etc.
+        def setup(self, stage):
+            # things to do on every process in DDP
+            # load data, set variables, etc.
+        def train_dataloader(self):
+            # return train dataloader
+        def val_dataloader(self):
+            # return validation dataloader
+        def test_dataloader(self):
+            # return test dataloader
+        def teardown(self):
+            # called on every process in DDP
+            # clean up after fit or test
+    This allows you to share a full dataset without explaining how to download,
+    split, transform and process the data.
+    Read the docs:
+        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
+    """
+    def __init__(
+            self,
+            task: Literal['regression', 'binary', 'multiclass'],
+            num_classes: Optional[int],
+            batch_size: int,
+            # train: bool,
+            drug_featurizer: callable,
+            protein_featurizer: callable,
+            collator: callable = collate_fn,
+            data_dir: str = "data/",
+            data_file: Optional[str] = None,
+            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
+            split: Optional[callable] = None,
+            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
+            discard_intermediate: Optional[bool] = False,
+            num_workers: int = 0,
+            pin_memory: bool = False,
+    ):
+        super().__init__()
+        self.train_data: Optional[Dataset] = None
+        self.val_data: Optional[Dataset] = None
+        self.test_data: Optional[Dataset] = None
+        self.predict_data: Optional[Dataset] = None
+        self.split = split
+        self.collator = collator
+        self.dataset = partial(
+            DTIDataset,
+            task=task,
+            num_classes=num_classes,
+            drug_featurizer=drug_featurizer,
+            protein_featurizer=protein_featurizer,
+            thresholds=thresholds,
+            discard_intermediate=discard_intermediate
+        )
+        # this line allows to access init params with 'self.hparams' ensures init params will be stored in ckpt
+        self.save_hyperparameters(logger=False)  # ignore=['split']
+    def prepare_data(self):
+        """
+        Download data if needed.
+        Do not use it to assign state (e.g., self.x = x).
+        """
+    def setup(self, stage: Optional[str] = None, encoding: str = None):
+        """
+        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
+        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
+        careful not to execute data splitting twice.
+        """
+        # load and split datasets only if not loaded in initialization
+        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
+            if self.hparams.train_val_test_split:
+                if len(self.hparams.train_val_test_split) != 3:
+                    raise ValueError('Length of `train_val_test_split` must be 3. '
+                                     'Set the second element to None for training without validation. '
+                                     'Set the third element to None for training without testing.')
+                self.train_data = self.hparams.train_val_test_split[0]
+                self.val_data = self.hparams.train_val_test_split[1]
+                self.test_data = self.hparams.train_val_test_split[2]
+                if all([self.hparams.data_file, self.split]):
+                    if all(isinstance(split, Number) or split is None
+                           for split in self.hparams.train_val_test_split):
+                        split_data = self.split(
+                            dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
+                            lengths=[split for split in self.hparams.train_val_test_split if split is not None]
+                        )
+                        for dataset in ['train_data', 'val_data', 'test_data']:
+                            if getattr(self, dataset) is not None:
+                                setattr(self, dataset, split_data.pop(0))
+                    else:
+                        raise ValueError('`train_val_test_split` must be a sequence numbers or None'
+                                         '(float for percentages and int for sample numbers) '
+                                         'if both `data_file` and `split` have been specified.')
+                elif (all(isinstance(split, str) or split is None
+                          for split in self.hparams.train_val_test_split)
+                      and not any([self.hparams.data_file, self.split])):
+                    for dataset in ['train_data', 'val_data', 'test_data']:
+                        if getattr(self, dataset) is not None:
+                            data_path = Path(getattr(self, dataset))
+                            if not data_path.is_absolute():
+                                data_path = Path(self.hparams.data_dir, data_path)
+                            setattr(self, dataset, self.dataset(data_path=data_path))
+                else:
+                    raise ValueError('For training, you must specify either all of `data_file`, `split`, '
+                                     'and `train_val_test_split` as a sequence of numbers or '
+                                     'solely `train_val_test_split` as a sequence of data file paths.')
+            elif self.hparams.data_file and not any([self.split, self.hparams.train_val_test_split]):
+                data_path = Path(self.hparams.data_file)
+                if not data_path.is_absolute():
+                    data_path = Path(self.hparams.data_dir, data_path)
+                self.test_data = self.predict_data = self.dataset(data_path=data_path)
+            else:
+                raise ValueError("For training, you must specify `train_val_test_split`. "
+                                 "For testing/predicting, you must specify only `data_file` without "
+                                 "`train_val_test_split` or `split`.")
+    def train_dataloader(self):
+        return DataLoader(
+            dataset=self.train_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.train_data,
+                batch_size=self.hparams.batch_size,
+                # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
+                # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
+                drop_last=True,
+                shuffle=True,
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=True,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=self.collator,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def val_dataloader(self):
+        return DataLoader(
+            dataset=self.val_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.val_data,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=self.collator,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def test_dataloader(self):
+        return DataLoader(
+            dataset=self.test_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.test_data,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=self.collator,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def predict_dataloader(self):
+        return DataLoader(
+            dataset=self.predict_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.predict_data,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=self.collator,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def teardown(self, stage: Optional[str] = None):
+        """Clean up after fit or test."""
+        pass
+    def state_dict(self):
+        """Extra things to save to checkpoint."""
+        return {}
+    def load_state_dict(self, state_dict: Dict[str, Any]):
+        """Things to do when loading checkpoint."""
+        pass

deepscreen/data/dti.py.bak ADDED Viewed

	@@ -0,0 +1,369 @@

+from functools import partial
+from numbers import Number
+from pathlib import Path
+from typing import Any, Dict, Optional, Sequence, Union, Literal
+from lightning import LightningDataModule
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+from torch.utils.data import Dataset, DataLoader
+from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
+from deepscreen.utils import get_logger
+log = get_logger(__name__)
+# TODO: save a list of corrupted records
+class DTIDataset(Dataset):
+    def __init__(
+            self,
+            task: Literal['regression', 'binary', 'multiclass'],
+            n_class: Optional[int],
+            data_path: str | Path,
+            drug_featurizer: callable,
+            protein_featurizer: callable,
+            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
+            discard_intermediate: Optional[bool] = False,
+    ):
+        df = pd.read_csv(
+            data_path,
+            engine='python',
+            header=0,
+            usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
+            dtype={
+                'X1': 'str',
+                'ID1': 'str',
+                'X2': 'str',
+                'ID2': 'str',
+                'Y': 'float32',
+                'U': 'str',
+            },
+        )
+        # Read the whole data table
+        # if 'ID1' in df:
+        #     self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
+        # if 'ID2' in df:
+        #     self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
+        #     self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
+        # self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
+        # # train and eval mode data processing (fully labelled)
+        # if 'Y' in df.columns and df['Y'].notnull().all():
+        log.info(f"Processing data file: {data_path}")
+        # Forward-fill all non-label columns
+        df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
+        if 'Y' in df:
+            log.info(f"Performing pre-transformation target validation.")
+            # TODO: check sklearn.utils.multiclass.check_classification_targets
+            match task:
+                case 'regression':
+                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
+                        f"""`Y` must be numeric for `regression` task,
+                        but it has {set(df['Y'].apply(type))}."""
+                case 'binary':
+                    if all(df['Y'].isin([0, 1])):
+                        assert not thresholds, \
+                            f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
+                            but still got `thresholds` {thresholds}.
+                            Double check your choices of `task` and `thresholds` and records in the `Y` column."""
+                    else:
+                        assert thresholds, \
+                            f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
+                            but it has {pd.unique(df['Y'])}.
+                            You must set `thresholds` to discretize continuous labels."""
+                case 'multiclass':
+                    assert n_class >= 3, f'`n_class` for `multiclass` (classification) `task` must be at least 3.'
+                    if all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)):
+                        assert not thresholds, \
+                            f"""`Y` is already non-negative integers for
+                            `multiclass` (classification) `task`, but still got `thresholds` {thresholds}.
+                            Double check your choice of `task`, `thresholds` and records in the `Y` column."""
+                    else:
+                        assert thresholds, \
+                            f"""`Y` must be non-negative integers for
+                            `multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
+                            You must set `thresholds` to discretize continuous labels."""
+            if 'U' in df.columns:
+                units = df['U']
+            else:
+                units = None
+                log.warning("Units ('U') not in the data table. "
+                            "Assuming all labels to be discrete or in p-scale (-log10[M]).")
+            # Transform labels
+            df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
+                                      discard_intermediate=discard_intermediate)
+            # Filter out rows with a NaN in Y (missing values)
+            df.dropna(subset=['Y'], inplace=True)
+            log.info(f"Performing post-transformation target validation.")
+            match task:
+                case 'regression':
+                    df['Y'] = df['Y'].astype('float32')
+                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
+                        f"""`Y` must be numeric for `regression` task,
+                        but after transformation it still has {set(df['Y'].apply(type))}.
+                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                case 'binary':
+                    df['Y'] = df['Y'].astype('int')
+                    assert all(df['Y'].isin([0, 1])), \
+                        f"""`Y` must be 0 or 1 for `binary` (classification) `task`, "
+                        but after transformation it still has {pd.unique(df['Y'])}.
+                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                case 'multiclass':
+                    df['Y'] = df['Y'].astype('int')
+                    assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
+                        f"""Y must be non-negative integers for task `multiclass` (classification)
+                        but after transformation it still has {pd.unique(df['Y'])}.
+                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                    target_n_unique = df['Y'].nunique()
+                    assert target_n_unique == n_class, \
+                        f"""You have set `n_class` for `multiclass` (classification) `task` to {n_class},
+                        but after transformation Y still has {target_n_unique} unique labels.
+                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+        # Indexed protein/FASTA for retrieval metrics
+        df['IDX'] = LabelEncoder().fit_transform(df['X2'])
+        self.df = df
+        self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
+        self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
+    def __len__(self):
+        return len(self.df.index)
+    def __getitem__(self, i):
+        sample = self.df.loc[i]
+        return {
+            'N': i,
+            'X1': self.drug_featurizer(sample['X1']),
+            'ID1': sample.get('ID1', sample['X1']),
+            'X2': self.protein_featurizer(sample['X2']),
+            'ID2': sample.get('ID2', sample['X2']),
+            'Y': sample.get('Y'),
+            'IDX': sample['IDX'],
+        }
+class DTIDataModule(LightningDataModule):
+    """
+    DTI DataModule
+    A DataModule implements 5 key methods:
+        def prepare_data(self):
+            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
+            # download data, pre-process, split, save to disk, etc.
+        def setup(self, stage):
+            # things to do on every process in DDP
+            # load data, set variables, etc.
+        def train_dataloader(self):
+            # return train dataloader
+        def val_dataloader(self):
+            # return validation dataloader
+        def test_dataloader(self):
+            # return test dataloader
+        def teardown(self):
+            # called on every process in DDP
+            # clean up after fit or test
+    This allows you to share a full dataset without explaining how to download,
+    split, transform and process the data.
+    Read the docs:
+        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
+    """
+    def __init__(
+            self,
+            task: Literal['regression', 'binary', 'multiclass'],
+            n_class: Optional[int],
+            batch_size: int,
+            # train: bool,
+            drug_featurizer: callable,
+            protein_featurizer: callable,
+            collator: callable = collate_fn,
+            data_dir: str = "data/",
+            data_file: Optional[str] = None,
+            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
+            split: Optional[callable] = None,
+            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
+            discard_intermediate: Optional[bool] = False,
+            num_workers: int = 0,
+            pin_memory: bool = False,
+    ):
+        super().__init__()
+        self.train_data: Optional[Dataset] = None
+        self.val_data: Optional[Dataset] = None
+        self.test_data: Optional[Dataset] = None
+        self.predict_data: Optional[Dataset] = None
+        self.split = split
+        self.collator = collator
+        self.dataset = partial(
+            DTIDataset,
+            task=task,
+            n_class=n_class,
+            drug_featurizer=drug_featurizer,
+            protein_featurizer=protein_featurizer,
+            thresholds=thresholds,
+            discard_intermediate=discard_intermediate
+        )
+        if train_val_test_split:
+            # TODO test behavior for trainer.test and predict when this is passed
+            if len(train_val_test_split) not in [2, 3]:
+                raise ValueError('Length of `train_val_test_split` must be 2 (for training without testing) or 3.')
+            if all([data_file, split]):
+                if all(isinstance(split, Number) for split in train_val_test_split):
+                    pass
+                else:
+                    raise ValueError('`train_val_test_split` must be a sequence numbers '
+                                     '(float for percentages and int for sample numbers) '
+                                     'if both `data_file` and `split` have been specified.')
+            elif all(isinstance(split, str) for split in train_val_test_split) and not any([data_file, split]):
+                split_paths = []
+                for split in train_val_test_split:
+                    split = Path(split)
+                    if not split.is_absolute():
+                        split = Path(data_dir, split)
+                    split_paths.append(split)
+                self.train_data = self.dataset(data_path=split_paths[0])
+                self.val_data = self.dataset(data_path=split_paths[1])
+                if len(train_val_test_split) == 3:
+                    self.test_data = self.dataset(data_path=split_paths[2])
+            else:
+                raise ValueError('For training, you must specify either `data_file`, `split`, '
+                                 'and `train_val_test_split` as a sequence of numbers or '
+                                 'solely `train_val_test_split` as a sequence of data file paths.')
+        elif data_file and not any([split, train_val_test_split]):
+            data_file = Path(data_file)
+            if not data_file.is_absolute():
+                data_file = Path(data_dir, data_file)
+            self.test_data = self.predict_data = self.dataset(data_path=data_file)
+        else:
+            raise ValueError("For training, you must specify `train_val_test_split`. "
+                             "For testing/predicting, you must specify only `data_file` without "
+                             "`train_val_test_split` or `split`.")
+        # this line allows to access init params with 'self.hparams' attribute
+        # also ensures init params will be stored in ckpt
+        self.save_hyperparameters(logger=False)  # ignore=['split']
+    def prepare_data(self):
+        """
+        Download data if needed.
+        Do not use it to assign state (e.g., self.x = x).
+        """
+    def setup(self, stage: Optional[str] = None, encoding: str = None):
+        """
+        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
+        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
+        careful not to execute data splitting twice.
+        """
+        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
+        # load and split datasets only if not loaded in initialization
+        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
+            self.train_data, self.val_data, self.test_data = self.split(
+                dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
+                lengths=self.hparams.train_val_test_split
+            )
+    def train_dataloader(self):
+        return DataLoader(
+            dataset=self.train_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.train_data,
+                batch_size=self.hparams.batch_size,
+                # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
+                # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
+                drop_last=True,
+                shuffle=True,
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=True,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=self.collator,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def val_dataloader(self):
+        return DataLoader(
+            dataset=self.val_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.val_data,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=self.collator,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def test_dataloader(self):
+        return DataLoader(
+            dataset=self.test_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.test_data,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=self.collator,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def predict_dataloader(self):
+        return DataLoader(
+            dataset=self.predict_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.predict_data,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=self.collator,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def teardown(self, stage: Optional[str] = None):
+        """Clean up after fit or test."""
+        pass
+    def state_dict(self):
+        """Extra things to save to checkpoint."""
+        return {}
+    def load_state_dict(self, state_dict: Dict[str, Any]):
+        """Things to do when loading checkpoint."""
+        pass

deepscreen/data/dti_datamodule.py ADDED Viewed

	@@ -0,0 +1,314 @@

+# from itertools import product
+from collections import namedtuple
+from numbers import Number
+from typing import Any, Dict, Optional, Sequence, Union, Literal
+# import numpy as np
+import pandas as pd
+from lightning import LightningDataModule
+from torch.utils.data import Dataset, DataLoader, random_split
+from deepscreen.data.utils.label import label_transform
+from deepscreen.data.utils.collator import collate_fn
+from deepscreen.data.utils.sampler import SafeBatchSampler
+class DTIDataset(Dataset):
+    def __init__(
+            self,
+            task: Literal['regression', 'binary', 'multiclass'],
+            n_classes: Optional[int],
+            data_dir: str,
+            dataset_name: str,
+            drug_featurizer: callable,
+            protein_featurizer: callable,
+            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
+            discard_intermediate: Optional[bool] = False,
+    ):
+        df = pd.read_csv(
+            f'{data_dir}{dataset_name}.csv',
+            header=0, sep=',',
+            usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
+            dtype={'X1': 'str', 'ID1': 'str',
+                   'X2': 'str', 'ID2': 'str',
+                   'Y': 'float32', 'U': 'str'}
+        )
+        # if 'ID1' in df:
+        #     self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
+        # if 'ID2' in df:
+        #     self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
+        #     self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
+        # self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
+        # # train and eval mode data processing (fully labelled)
+        # if 'Y' in df.columns and df['Y'].notnull().all():
+        # Forward-fill all non-label columns
+        df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
+        if 'Y' in df:
+            # Transform labels
+            df['Y'] = df['Y'].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
+                                    discard_intermediate=discard_intermediate).astype('float32')
+            # Filter out rows with a NaN in Y (missing values)
+            df.dropna(subset=['Y'], inplace=True)
+            # Validate target labels for training/testing
+            # TODO: check sklearn.utils.multiclass.check_classification_targets
+            match task:
+                case 'regression':
+                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
+                        f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
+                case 'binary':
+                    assert all(df['Y'].isin([0, 1])), \
+                        f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
+                        "\nYou may set `thresholds` to discretize continuous labels."
+                case 'multiclass':
+                    assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
+                    assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
+                        f"Y for task `multiclass` (classification) must be non-negative integers, " \
+                        f"but Y got {pd.unique(df['Y'])}." \
+                        "\nYou may set `thresholds` to discretize continuous labels."
+                    target_n_unique = df['Y'].nunique()
+                    assert target_n_unique == n_classes, \
+                        f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
+                        f"but Y has {target_n_unique} unique labels."
+        # # Predict mode data processing
+        # else:
+        #     df = pd.DataFrame(product(df['X1'].dropna(), df['X2'].dropna()), columns=['X1', 'X2'])
+        #     if hasattr(self, "x1_to_id1"):
+        #         df['ID1'] = df['X1'].map(self.x1_to_id1)
+        #     if hasattr(self, "x1_to_id2"):
+        #         df['ID2'] = df['X2'].map(self.x2_to_id2)
+        # self.smiles = df['X1']
+        # self.fasta = df['X2']
+        # self.smiles_ids = df.get('ID1', df['X1'])
+        # self.fasta_ids = df.get('ID2', df['X2'])
+        # self.labels = df.get('Y', None)
+        self.df = df
+        self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
+        self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
+        self.n_classes = df['Y'].nunique()
+        # self.train = train
+        self.Data = namedtuple('Data', ['FT1', 'ID1', 'FT2', 'ID2', 'Y'])
+    def __len__(self):
+        return len(self.df.index)
+    def __getitem__(self, idx):
+        sample = self.df.loc[idx]
+        return self.Data(
+            FT1=self.drug_featurizer(sample['X1']),
+            ID1=sample.get('ID1', sample['X1']),
+            FT2=self.protein_featurizer(sample['X2']),
+            ID2=sample.get('ID2', sample['X2']),
+            Y=sample.get('Y')
+        )
+        #     {
+        #     'FT1': self.drug_featurizer(sample['X1']),
+        #     'ID1': sample.get('ID1', sample['X1']),
+        #     'FT2': self.protein_featurizer(sample['X2']),
+        #     'ID2': sample.get('ID2', sample['X2']),
+        #     'Y': sample.get('Y')
+        # }
+        # if self.train:
+        # sample = self.drug_featurizer(self.smiles[idx]), self.protein_featurizer(self.fasta[idx]), self.labels[idx]
+        # sample = {
+        #     'FT1': self.drug_featurizer(self.smiles[idx]),
+        #     'FT2': self.protein_featurizer(self.fasta[idx]),
+        #     'ID2': self.smiles_ids[idx],
+        # }
+        # else:
+        #     # sample = self.drug_featurizer(self.smiles[idx]), self.protein_featurizer(self.fasta[idx])
+        #     sample = {
+        #         'FT1': self.drug_featurizer(self.smiles[idx]),
+        #         'FT2': self.protein_featurizer(self.fasta[idx]),
+        #     }
+        #
+        # if all([True if n is not None else False for n in sample.values()]):
+        #     return sample  # | {
+        #     #     'ID1': self.smiles_ids[idx],
+        #     #     'X1': self.drug_featurizer(self.smiles[idx]),
+        #     #     'ID2': self.fasta_ids[idx],
+        #     #     'X2': self.protein_featurizer(self.fasta[idx]),
+        #     # }
+        # else:
+        #     return self.__getitem__(np.random.randint(0, self.size))
+class DTIdatamodule(LightningDataModule):
+    """
+    DTI DataModule
+    A DataModule implements 5 key methods:
+        def prepare_data(self):
+            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
+            # download data, pre-process, split, save to disk, etc.
+        def setup(self, stage):
+            # things to do on every process in DDP
+            # load data, set variables, etc.
+        def train_dataloader(self):
+            # return train dataloader
+        def val_dataloader(self):
+            # return validation dataloader
+        def test_dataloader(self):
+            # return test dataloader
+        def teardown(self):
+            # called on every process in DDP
+            # clean up after fit or test
+    This allows you to share a full dataset without explaining how to download,
+    split, transform and process the data.
+    Read the docs:
+        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
+    """
+    def __init__(
+            self,
+            task: Literal['regression', 'binary', 'multiclass'],
+            n_classes: Optional[int],
+            train: bool,
+            drug_featurizer: callable,
+            protein_featurizer: callable,
+            batch_size: int,
+            train_val_test_split: Optional[Sequence[Number]],
+            num_workers: int = 0,
+            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
+            pin_memory: bool = False,
+            data_dir: str = "data/",
+            dataset_name: Optional[str] = None,
+            split: Optional[callable] = random_split,
+    ):
+        super().__init__()
+        # this line allows to access init params with 'self.hparams' attribute
+        # also ensures init params will be stored in ckpt
+        self.save_hyperparameters(logger=False)
+        # data processing
+        self.data_split = split
+        self.data_train: Optional[Dataset] = None
+        self.data_val: Optional[Dataset] = None
+        self.data_test: Optional[Dataset] = None
+        self.data_predict: Optional[Dataset] = None
+    def prepare_data(self):
+        """
+        Download data if needed.
+        Do not use it to assign state (e.g., self.x = x).
+        """
+    def setup(self, stage: Optional[str] = None, encoding: str = None):
+        """
+        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
+        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
+        careful not to execute data splitting twice.
+        """
+        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
+        # load and split datasets only if not loaded in initialization
+        if not any([self.data_train, self.data_val, self.data_test, self.data_predict]):
+            dataset = DTIDataset(
+                task=self.hparams.task,
+                n_classes=self.hparams.n_classes,
+                data_dir=self.hparams.data_dir,
+                drug_featurizer=self.hparams.drug_featurizer,
+                protein_featurizer=self.hparams.protein_featurizer,
+                dataset_name=self.hparams.dataset_name,
+                thresholds=self.hparams.thresholds,
+            )
+            if self.hparams.train:
+                self.data_train, self.data_val, self.data_test = self.data_split(
+                    dataset=dataset,
+                    lengths=self.hparams.train_val_test_split
+                )
+            else:
+                self.data_test = self.data_predict = dataset
+    def train_dataloader(self):
+        return DataLoader(
+            dataset=self.data_train,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.data_train,
+                batch_size=self.hparams.batch_size,
+                drop_last=True,
+                shuffle=True,
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=True,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=collate_fn,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def val_dataloader(self):
+        return DataLoader(
+            dataset=self.data_val,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.data_val,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False,
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=collate_fn,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def test_dataloader(self):
+        return DataLoader(
+            dataset=self.data_test,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.data_test,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False,
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=collate_fn,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def predict_dataloader(self):
+        return DataLoader(
+            dataset=self.data_predict,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.data_predict,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+                shuffle=False,
+            ),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=collate_fn,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def teardown(self, stage: Optional[str] = None):
+        """Clean up after fit or test."""
+        pass
+    def state_dict(self):
+        """Extra things to save to checkpoint."""
+        return {}
+    def load_state_dict(self, state_dict: Dict[str, Any]):
+        """Things to do when loading checkpoint."""
+        pass

deepscreen/data/entity_datamodule.py ADDED Viewed

	@@ -0,0 +1,167 @@

+from numbers import Number
+from pathlib import Path
+from typing import Any, Dict, Optional, Sequence, Type
+from lightning import LightningDataModule
+from sklearn.base import TransformerMixin
+from torch.utils.data import Dataset, DataLoader
+from deepscreen.data.utils import collate_fn, SafeBatchSampler
+from deepscreen.data.utils.dataset import BaseEntityDataset
+class EntityDataModule(LightningDataModule):
+    """
+    def prepare_data(self):
+        # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
+        # download data, pre-process, split, save to disk, etc.
+    def setup(self, stage):
+        # things to do on every process in DDP
+        # load data, set variables, etc.
+    def train_dataloader(self):
+        # return train dataloader
+    def val_dataloader(self):
+        # return validation dataloader
+    def test_dataloader(self):
+        # return test dataloader
+    def teardown(self):
+        # called on every process in DDP
+        # clean up after fit or test
+    """
+    def __init__(
+            self,
+            dataset: type[BaseEntityDataset],
+            transformer: type[TransformerMixin],
+            train: bool,
+            batch_size: int,
+            data_dir: str = "data/",
+            data_file: Optional[str] = None,
+            train_val_test_split: Optional[Sequence[Number], Sequence[str]] = None,
+            split: Optional[callable] = None,
+            num_workers: int = 0,
+            pin_memory: bool = False,
+    ):
+        super().__init__()
+        # data processing
+        self.split = split
+        if train:
+            if all([data_file, split]):
+                if all(isinstance(split, Number) for split in train_val_test_split):
+                    pass
+                else:
+                    raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
+                                     '(float for percentages and int for sample numbers) if '
+                                     '`data_file` and `split` have been specified.')
+            elif all(isinstance(split, str) for split in train_val_test_split) and not any([data_file, split]):
+                self.train_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[0]))
+                self.val_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[1]))
+                self.test_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[2]))
+            else:
+                raise ValueError('For training (train=True), you must specify either '
+                                 '`dataset_name` and `split` with `train_val_test_split` of 3 numbers or '
+                                 'solely `train_val_test_split` of 3 data file names.')
+        else:
+            if data_file and not any([split, train_val_test_split]):
+                self.test_data = self.predict_data = dataset(dataset_path=str(Path(data_dir) / data_file))
+            else:
+                raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
+                                 "`train_val_test_split` or `split`")
+        # this line allows to access init params with 'self.hparams' attribute
+        # also ensures init params will be stored in ckpt
+        self.save_hyperparameters(logger=False)
+    def prepare_data(self):
+        """
+        Download data if needed.
+        Do not use it to assign state (e.g., self.x = x).
+        """
+    def setup(self, stage: Optional[str] = None, encoding: str = None):
+        """
+        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
+        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
+        careful not to execute data splitting twice.
+        """
+        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
+        # TODO: find a way to apply transformer.fit_transform only to train and transformer.transform only to val, test
+        # load and split datasets only if not loaded in initialization
+        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
+            self.train_data, self.val_data, self.test_data = self.split(
+                dataset=self.hparams.dataset(data_dir=self.hparams.data_dir,
+                                             dataset_name=self.hparams.train_dataset_name),
+                lengths=self.hparams.train_val_test_split
+            )
+    def train_dataloader(self):
+        return DataLoader(
+            dataset=self.train_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.train_data,
+                batch_size=self.hparams.batch_size,
+                shuffle=True),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=True,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=collate_fn,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def val_dataloader(self):
+        return DataLoader(
+            dataset=self.val_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.val_data,
+                batch_size=self.hparams.batch_size,
+                shuffle=False),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=collate_fn,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def test_dataloader(self):
+        return DataLoader(
+            dataset=self.test_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.test_data,
+                batch_size=self.hparams.batch_size,
+                shuffle=False),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=collate_fn,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def predict_dataloader(self):
+        return DataLoader(
+            dataset=self.predict_data,
+            batch_sampler=SafeBatchSampler(
+                data_source=self.predict_data,
+                batch_size=self.hparams.batch_size,
+                shuffle=False),
+            # batch_size=self.hparams.batch_size,
+            # shuffle=False,
+            num_workers=self.hparams.num_workers,
+            pin_memory=self.hparams.pin_memory,
+            collate_fn=collate_fn,
+            persistent_workers=True if self.hparams.num_workers > 0 else False
+        )
+    def teardown(self, stage: Optional[str] = None):
+        """Clean up after fit or test."""
+        pass
+    def state_dict(self):
+        """Extra things to save to checkpoint."""
+        return {}
+    def load_state_dict(self, state_dict: Dict[str, Any]):
+        """Things to do when loading checkpoint."""
+        pass

deepscreen/data/featurizers/__init__.py ADDED Viewed

File without changes

deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (191 Bytes). View file

deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc ADDED Viewed

Binary file (5.6 kB). View file

deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc ADDED Viewed

Binary file (14.9 kB). View file

deepscreen/data/featurizers/categorical.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import numpy as np
+# Sets of KNOWN characters in SMILES and FASTA sequences
+# Use list instead of set to preserve character order
+SMILES_VOCAB = ('#', '%', ')', '(', '+', '-', '.', '1', '0', '3', '2', '5', '4',
+                '7', '6', '9', '8', '=', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I',
+                'H', 'K', 'M', 'L', 'O', 'N', 'P', 'S', 'R', 'U', 'T', 'W', 'V',
+                'Y', '[', 'Z', ']', '_', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i',
+                'h', 'm', 'l', 'o', 'n', 's', 'r', 'u', 't', 'y')
+FASTA_VOCAB = ('A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'O',
+               'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z')
+# Check uniqueness, create character-index dicts, and add '?' for unknown characters as index 0
+assert len(SMILES_VOCAB) == len(set(SMILES_VOCAB)), 'SMILES_CHARSET has duplicate characters.'
+SMILES_CHARSET_IDX = {character: index+1 for index, character in enumerate(SMILES_VOCAB)} | {'?': 0}
+assert len(FASTA_VOCAB) == len(set(FASTA_VOCAB)), 'FASTA_CHARSET has duplicate characters.'
+FASTA_CHARSET_IDX = {character: index+1 for index, character in enumerate(FASTA_VOCAB)} | {'?': 0}
+def sequence_to_onehot(sequence: str, charset, max_sequence_length: int):
+    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
+    charset_idx = {character: index+1 for index, character in enumerate(charset)} | {'?': 0}
+    onehot = np.zeros((max_sequence_length, len(charset_idx)), dtype=int)
+    for index, character in enumerate(sequence[:max_sequence_length]):
+        onehot[index, charset_idx.get(character, 0)] = 1
+    return onehot.transpose()
+def sequence_to_label(sequence: str, charset, max_sequence_length: int):
+    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
+    charset_idx = {character: index+1 for index, character in enumerate(charset)} | {'?': 0}
+    label = np.zeros(max_sequence_length, dtype=int)
+    for index, character in enumerate(sequence[:max_sequence_length]):
+        label[index] = charset_idx.get(character, 0)
+    return label
+def smiles_to_onehot(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):  # , in_channels: int = len(SMILES_CHARSET)
+    # assert len(SMILES_CHARSET) == len(set(SMILES_CHARSET)), 'SMILES_CHARSET has duplicate characters.'
+    # onehot = np.zeros((max_sequence_length, len(SMILES_CHARSET_IDX)))
+    # for index, character in enumerate(smiles[:max_sequence_length]):
+    #     onehot[index, SMILES_CHARSET_IDX.get(character, 0)] = 1
+    # return onehot.transpose()
+    return sequence_to_onehot(smiles, smiles_charset, max_sequence_length)
+def smiles_to_label(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):  # , in_channels: int = len(SMILES_CHARSET)
+    # label = np.zeros(max_sequence_length)
+    # for index, character in enumerate(smiles[:max_sequence_length]):
+    #     label[index] = SMILES_CHARSET_IDX.get(character, 0)
+    # return label
+    return sequence_to_label(smiles, smiles_charset, max_sequence_length)
+def fasta_to_onehot(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):  # in_channels: int = len(FASTA_CHARSET)
+    # onehot = np.zeros((max_sequence_length, len(FASTA_CHARSET_IDX)))
+    # for index, character in enumerate(fasta[:max_sequence_length]):
+    #     onehot[index, FASTA_CHARSET_IDX.get(character, 0)] = 1
+    # return onehot.transpose()
+    return sequence_to_onehot(fasta, fasta_charset, max_sequence_length)
+def fasta_to_label(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):  # in_channels: int = len(FASTA_CHARSET)
+    # label = np.zeros(max_sequence_length)
+    # for index, character in enumerate(fasta[:max_sequence_length]):
+    #     label[index] = FASTA_CHARSET_IDX.get(character, 0)
+    # return label
+    return sequence_to_label(fasta, fasta_charset, max_sequence_length)
+def one_of_k_encoding(x, allowable_set):
+    if x not in allowable_set:
+        raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set))
+    return list(map(lambda s: x == s, allowable_set))
+def one_of_k_encoding_unk(x, allowable_set):
+    """Maps inputs not in the allowable set to the last element."""
+    if x not in allowable_set:
+        x = allowable_set[-1]
+    return list(map(lambda s: x == s, allowable_set))

deepscreen/data/featurizers/chem.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""
+Mainly adapted from MolMap:
+https://github.com/shenwanxiang/bidd-molmap/tree/master/molmap/feature/fingerprint
+"""
+import numpy as np
+from rdkit import Chem, DataStructs
+from rdkit.Chem import AllChem
+from rdkit.Chem.Fingerprints import FingerprintMols
+from rdkit.Chem.rdReducedGraphs import GetErGFingerprint
+from deepscreen import get_logger
+log = get_logger(__name__)
+def smiles_to_erg(smiles):
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        features = np.array(GetErGFingerprint(mol), dtype=bool)
+        return features
+    except Exception as e:
+        log.warning(f"Failed to convert SMILES ({smiles}) to ErGFP due to {str(e)}")
+        return None
+def smiles_to_morgan(smiles, radius=2, n_bits=1024):
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
+        features = np.zeros((1,))
+        DataStructs.ConvertToNumpyArray(features_vec, features)
+    except Exception as e:
+        log.warning(f"Failed to convert SMILES ({smiles}) to ErGFP due to {str(e)}")
+        return None
+def smiles_to_daylight(smiles):
+    try:
+        NumFinger = 2048
+        mol = Chem.MolFromSmiles(smiles)
+        bv = FingerprintMols.FingerprintMol(mol)
+        temp = tuple(bv.GetOnBits())
+        features = np.zeros((NumFinger,))
+        features[np.array(temp)] = 1
+    except:
+        print(f'RDKit could not find this SMILES: {smiles} convert to all 0 features')
+        features = np.zeros((2048,))
+    return features.astype(int)

deepscreen/data/featurizers/fcs.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from importlib import resources
+import numpy as np
+import pandas as pd
+from subword_nmt.apply_bpe import BPE
+import codecs
+# Module-level vocabulary loading: runs once at import time and reads four resource files
+# located under `resources/vocabs/ESPF/` next to the `deepscreen` package directory.
+# --- Protein sub-word vocabulary ---
+vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/protein_codes_uniprot.txt')
+# NOTE(review): this file handle is never closed explicitly; whether it can be closed right after
+# constructing BPE depends on subword_nmt reading the codes eagerly -- confirm before changing.
+bpe_codes_protein = codecs.open(vocab_path)
+protein_bpe = BPE(bpe_codes_protein, merges=-1, separator='')
+sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_uniprot.csv')
+sub_csv = pd.read_csv(sub_csv_path)
+# The CSV's 'index' column holds the sub-word units; a unit's row position becomes its integer id.
+idx2word_protein = sub_csv['index'].values
+words2idx_protein = dict(zip(idx2word_protein, range(0, len(idx2word_protein))))
+# --- Drug sub-word vocabulary (same steps; `vocab_path`, `sub_csv_path` and `sub_csv` are reused) ---
+vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/drug_codes_chembl.txt')
+# NOTE(review): same unclosed-handle caveat as for the protein codes above.
+bpe_codes_drug = codecs.open(vocab_path)
+drug_bpe = BPE(bpe_codes_drug, merges=-1, separator='')
+sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_chembl.csv')
+sub_csv = pd.read_csv(sub_csv_path)
+idx2word_drug = sub_csv['index'].values
+words2idx_drug = dict(zip(idx2word_drug, range(0, len(idx2word_drug))))
+def protein_to_embedding(x, max_sequence_length):
+    max_p = max_sequence_length
+    t1 = protein_bpe.process_line(x).split()  # split
+    try:
+        i1 = np.asarray([words2idx_protein[i] for i in t1])  # index
+    except:
+        i1 = np.array([0])
+        # print(x)
+    l = len(i1)
+    if l < max_p:
+        i = np.pad(i1, (0, max_p - l), 'constant', constant_values=0)
+        input_mask = ([1] * l) + ([0] * (max_p - l))
+    else:
+        i = i1[:max_p]
+        input_mask = [1] * max_p
+    return i, np.asarray(input_mask)
+def drug_to_embedding(x, max_sequence_length):
+    max_d = max_sequence_length
+    t1 = drug_bpe.process_line(x).split()  # split
+    try:
+        i1 = np.asarray([words2idx_drug[i] for i in t1])  # index
+    except:
+        i1 = np.array([0])
+        # print(x)
+    l = len(i1)
+    if l < max_d:
+        i = np.pad(i1, (0, max_d - l), 'constant', constant_values=0)
+        input_mask = ([1] * l) + ([0] * (max_d - l))
+    else:
+        i = i1[:max_d]
+        input_mask = [1] * max_d
+    return i, np.asarray(input_mask)

deepscreen/data/featurizers/fingerprint/__init__.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from typing import Literal
+from .atompairs import GetAtomPairFPs
+from .avalonfp import GetAvalonFPs
+from .rdkitfp import GetRDkitFPs
+from .morganfp import GetMorganFPs
+from .estatefp import GetEstateFPs
+from .maccskeys import GetMACCSFPs
+from .pharmErGfp import GetPharmacoErGFPs
+from .pharmPointfp import GetPharmacoPFPs
+from .pubchemfp import GetPubChemFPs
+from .torsions import GetTorsionFPs
+from .mhfp6 import GetMHFP6
+# from .map4 import GetMAP4
+from rdkit import Chem
+from deepscreen import get_logger
+log = get_logger(__name__)
+# Registry mapping a fingerprint name to the function that computes it from an RDKit molecule.
+# `smiles_to_fingerprint` looks names up here; the keys also define its `fingerprint` Literal type.
+FP_MAP = {
+    'MorganFP': GetMorganFPs,
+    'RDkitFP': GetRDkitFPs,
+    'AtomPairFP': GetAtomPairFPs,
+    'TorsionFP': GetTorsionFPs,
+    'AvalonFP': GetAvalonFPs,
+    'EstateFP': GetEstateFPs,
+    'MACCSFP': GetMACCSFPs,
+    'PharmacoErGFP': GetPharmacoErGFPs,
+    'PharmacoPFP': GetPharmacoPFPs,
+    'PubChemFP': GetPubChemFPs,
+    'MHFP6': GetMHFP6,
+    # MAP4 is disabled together with its import above.
+    # 'MAP4': GetMAP4,
+}
+def smiles_to_fingerprint(smiles, fingerprint: Literal[tuple(FP_MAP.keys())], **kwargs):
+    func = FP_MAP[fingerprint]
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        arr = func(mol, **kwargs)
+        return arr
+    except Exception as e:
+        log.warning(f"Failed to convert SMILES ({smiles}) to {fingerprint} due to {str(e)}")
+        return None

deepscreen/data/featurizers/fingerprint/atompairs.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from rdkit.Chem.AtomPairs import Pairs
+from rdkit.Chem import DataStructs
+import numpy as np
+_type = 'topological-based'
+def GetAtomPairFPs(mol, nBits=2048, binary=True):
+    '''
+    atompairs fingerprints
+    '''
+    fp = Pairs.GetHashedAtomPairFingerprint(mol, nBits=nBits)
+    if binary:
+        arr = np.zeros((0,), dtype=np.bool_)
+    else:
+        arr = np.zeros((0,), dtype=np.int8)
+    DataStructs.ConvertToNumpyArray(fp, arr)
+    return arr

deepscreen/data/featurizers/fingerprint/avalonfp.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from rdkit.Chem import DataStructs
+from rdkit.Avalon.pyAvalonTools import GetAvalonFP as GAFP
+import numpy as np
+_type = 'topological-based'
+def GetAvalonFPs(mol, nBits=2048):
+    '''
+    Avalon_fingerprints: https://pubs.acs.org/doi/pdf/10.1021/ci050413p
+    '''
+    fp = GAFP(mol, nBits=nBits)
+    arr = np.zeros((0,), dtype=np.bool_)
+    DataStructs.ConvertToNumpyArray(fp, arr)
+    return arr

deepscreen/data/featurizers/fingerprint/estatefp.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from rdkit.Chem.EState import Fingerprinter
+import numpy as np
+_type = 'Estate-based'
+def GetEstateFPs(mol):
+    '''
+    79 bits Estate fps
+    '''
+    x = Fingerprinter.FingerprintMol(mol)[0]
+    return x.astype(np.bool_)

deepscreen/data/featurizers/fingerprint/maccskeys.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from rdkit.Chem import AllChem
+from rdkit.Chem import DataStructs
+import numpy as np
+import pandas as pd
+import os
+_type = 'SMARTS-based'
+# Directory containing this module; used to locate the bundled 'maccskeys.xlsx'.
+file_path = os.path.dirname(__file__)
+def GetMACCSFPs(mol):
+    '''
+    166 bits
+    '''
+    fp = AllChem.GetMACCSKeysFingerprint(mol)
+    arr = np.zeros((0,), dtype=np.bool_)
+    DataStructs.ConvertToNumpyArray(fp, arr)
+    return arr
+def GetMACCSFPInfos():
+    return pd.read_excel(os.path.join(file_path, 'maccskeys.xlsx'))

deepscreen/data/featurizers/fingerprint/maccskeys.xlsx ADDED Viewed

Binary file (14 kB). View file

deepscreen/data/featurizers/fingerprint/map4.py ADDED Viewed

	@@ -0,0 +1,130 @@

+"""
+MinHashed Atom-pair Fingerprint, MAP
+Original paper: Capecchi, Alice, Daniel Probst, and Jean-Louis Reymond. "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome." Journal of Cheminformatics 12.1 (2020): 1-15. Original code: https://github.com/reymond-group/map4 (thanks to the authors for their original work).
+A small bug is fixed: https://github.com/reymond-group/map4/issues/6
+"""
+_type = 'topological-based'
+import itertools
+from collections import defaultdict
+import tmap as tm
+from mhfp.encoder import MHFPEncoder
+from rdkit import Chem
+from rdkit.Chem import rdmolops
+from rdkit.Chem.rdmolops import GetDistanceMatrix
+def to_smiles(mol):
+    return Chem.MolToSmiles(mol, canonical=True, isomericSmiles=False)
+class MAP4Calculator:
+    """Compute MAP4 (MinHashed atom-pair) fingerprints from RDKit molecules.
+    For every pair of atoms, the SMILES of each atom's environment (one per radius from 1 to
+    `radius`) is combined with the pair's topological distance into a "shingle" string; the
+    set of shingles is then hashed by the configured encoder.
+    """
+    def __init__(self, dimensions=2048, radius=2, is_counted=False, is_folded=False, fold_dimensions=2048):
+        """
+        MAP4 calculator class
+        Arguments:
+            dimensions -- size passed to the hashing encoder
+            radius -- largest atom-environment radius to enumerate
+            is_counted -- if True, repeated shingles get an occurrence counter appended
+            is_folded -- if True, use MHFPEncoder and fold the hash; otherwise use tmap's Minhash
+            fold_dimensions -- length passed to the encoder's fold (only used when is_folded)
+        """
+        self.dimensions = dimensions
+        self.radius = radius
+        self.is_counted = is_counted
+        self.is_folded = is_folded
+        self.fold_dimensions = fold_dimensions
+        # The encoder type depends on whether a folded fingerprint is requested.
+        if self.is_folded:
+            self.encoder = MHFPEncoder(dimensions)
+        else:
+            self.encoder = tm.Minhash(dimensions)
+    def calculate(self, mol):
+        """Calculates the atom pair minhashed fingerprint
+        Arguments:
+            mol -- rdkit mol object
+        Returns:
+            tmap VectorUint -- minhashed fingerprint
+            (when is_folded is set, the result of the encoder's fold is returned instead)
+        """
+        atom_env_pairs = self._calculate(mol)
+        if self.is_folded:
+            return self._fold(atom_env_pairs)
+        return self.encoder.from_string_array(atom_env_pairs)
+    def calculate_many(self, mols):
+        """ Calculates the atom pair minhashed fingerprint
+        Arguments:
+            mols -- list of mols
+        Returns:
+            list of tmap VectorUint -- minhashed fingerprints list
+            (when is_folded is set, a list of folded fingerprints is returned instead)
+        """
+        atom_env_pairs_list = [self._calculate(mol) for mol in mols]
+        if self.is_folded:
+            return [self._fold(pairs) for pairs in atom_env_pairs_list]
+        return self.encoder.batch_from_string_array(atom_env_pairs_list)
+    def _calculate(self, mol):
+        """Return the de-duplicated list of encoded shingles for `mol`."""
+        return self._all_pairs(mol, self._get_atom_envs(mol))
+    def _fold(self, pairs):
+        """Hash the set of shingles and fold the result to `fold_dimensions`."""
+        fp_hash = self.encoder.hash(set(pairs))
+        return self.encoder.fold(fp_hash, self.fold_dimensions)
+    def _get_atom_envs(self, mol):
+        """Map each atom index to its environment SMILES for radius 1..self.radius (in that order)."""
+        atoms_env = {}
+        for atom in mol.GetAtoms():
+            idx = atom.GetIdx()
+            for radius in range(1, self.radius + 1):
+                if idx not in atoms_env:
+                    atoms_env[idx] = []
+                atoms_env[idx].append(MAP4Calculator._find_env(mol, idx, radius))
+        return atoms_env
+    @classmethod
+    def _find_env(cls, mol, idx, radius):
+        """Return the SMILES of the environment of atom `idx` at `radius`, rooted at that atom.
+        Returns '' when the atom is not part of the extracted sub-molecule.
+        """
+        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
+        # atom_map is filled by PathToSubmol: original atom index -> index in the sub-molecule.
+        atom_map = {}
+        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
+        if idx in atom_map:
+            smiles = Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx], canonical=True, isomericSmiles=False)
+            return smiles
+        return ''
+    def _all_pairs(self, mol, atoms_env):
+        """Build the UTF-8 encoded shingles 'envA|distance|envB' for every atom pair and radius."""
+        atom_pairs = []
+        distance_matrix = GetDistanceMatrix(mol)
+        num_atoms = mol.GetNumAtoms()
+        # Occurrence counter per shingle; only consulted when is_counted is set.
+        shingle_dict = defaultdict(int)
+        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
+            dist = str(int(distance_matrix[idx1][idx2]))
+            for i in range(self.radius):
+                env_a = atoms_env[idx1][i]
+                env_b = atoms_env[idx2][i]
+                # Sort the two environments so the shingle does not depend on atom order.
+                ordered = sorted([env_a, env_b])
+                shingle = '{}|{}|{}'.format(ordered[0], dist, ordered[1])
+                if self.is_counted:
+                    # Count first, then append the running count; the lookup on the next line
+                    # still uses the un-suffixed shingle as the key.
+                    shingle_dict[shingle] += 1
+                    shingle += '|' + str(shingle_dict[shingle])
+                atom_pairs.append(shingle.encode('utf-8'))
+        # De-duplicate; the resulting order is that of set iteration.
+        return list(set(atom_pairs))
+def GetMAP4(mol, nBits=2048, radius=2, fold_dimensions=None):
+    """
+    MAP4: radius=2
+    """
+    if fold_dimensions == None:
+        fold_dimensions = nBits
+    calc = MAP4Calculator(dimensions=nBits, radius=radius, is_counted=False, is_folded=True,
+                          fold_dimensions=fold_dimensions)
+    arr = calc.calculate(mol)
+    return arr.astype(bool)

deepscreen/data/featurizers/fingerprint/mhfp6.py ADDED Viewed

	@@ -0,0 +1,18 @@

+"""
+Probst, Daniel, and Jean-Louis Reymond. "A probabilistic molecular fingerprint for big data settings." Journal of cheminformatics 10.1 (2018): 66.
+Original code: https://github.com/reymond-group/mhfp
+"""
+from mhfp.encoder import MHFPEncoder
+def GetMHFP6(mol, nBits=2048, radius=3):
+    """
+    MHFP6: radius=3
+    """
+    encoder = MHFPEncoder(n_permutations=nBits)
+    hash_values = encoder.encode_mol(mol, radius=radius, rings=True, kekulize=True, min_radius=1)
+    arr = encoder.fold(hash_values, nBits)
+    return arr.astype(bool)

deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef ADDED Viewed

	@@ -0,0 +1,53 @@

+AtomType NDonor [N&!H0&v3,N&!H0&+1&v4,n&H1&+0]
+AtomType ChalcDonor [O,S;H1;+0]
+DefineFeature SingleAtomDonor [{NDonor},{ChalcDonor},!$([D1]-[C;D3]=[O,S,N])]
+  Family Donor
+  Weights 1
+EndFeature
+AtomType NAcceptor [$([N&v3;H1,H2]-[!$(*=[O,N,P,S])])]
+Atomtype NAcceptor [$([N;v3;H0])]
+AtomType NAcceptor [$([n;+0])]
+AtomType ChalcAcceptor [$([O,S;H1;v2]-[!$(*=[O,N,P,S])])]
+AtomType ChalcAcceptor [O,S;H0;v2]
+Atomtype ChalcAcceptor [O,S;-]
+Atomtype ChalcAcceptor [o,s;+0]
+AtomType HalogenAcceptor [F]
+DefineFeature SingleAtomAcceptor [{NAcceptor},{ChalcAcceptor},{HalogenAcceptor}]
+  Family Acceptor
+  Weights 1
+EndFeature
+# this one is delightfully easy:
+DefineFeature AcidicGroup [C,S](=[O,S,P])-[O;H1,H0&-1]
+  Family NegIonizable
+  Weights 1.0,1.0,1.0
+EndFeature
+AtomType CarbonOrArom_NonCarbonyl [$([C,a]);!$([C,a](=O))]
+AtomType BasicNH2 [$([N;H2&+0][{CarbonOrArom_NonCarbonyl}])]
+AtomType BasicNH1 [$([N;H1&+0]([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
+AtomType BasicNH0 [$([N;H0&+0]([{CarbonOrArom_NonCarbonyl}])([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
+AtomType BasicNakedN [N,n;X2;+0]
+DefineFeature BasicGroup [{BasicNH2},{BasicNH1},{BasicNH0},{BasicNakedN}]
+  Family PosIonizable
+  Weights 1.0
+EndFeature
+# aromatic rings of various sizes:
+DefineFeature Arom5 a1aaaa1
+  Family Aromatic
+  Weights 1.0,1.0,1.0,1.0,1.0
+EndFeature
+DefineFeature Arom6 a1aaaaa1
+  Family Aromatic
+  Weights 1.0,1.0,1.0,1.0,1.0,1.0
+EndFeature
+DefineFeature Arom7 a1aaaaaa1
+  Family Aromatic
+  Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0
+EndFeature
+DefineFeature Arom8 a1aaaaaaa1
+  Family Aromatic
+  Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
+EndFeature

deepscreen/data/featurizers/fingerprint/morganfp.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from rdkit.Chem import AllChem
+from rdkit.Chem import DataStructs
+import numpy as np
+def GetMorganFPs(mol, nBits=2048, radius=2, return_bitInfo=False):
+    """
+    ECFP4: radius=2
+    """
+    bitInfo = {}
+    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius,
+                                               bitInfo=bitInfo, nBits=nBits)
+    arr = np.zeros((0,), dtype=np.bool_)
+    DataStructs.ConvertToNumpyArray(fp, arr)
+    if return_bitInfo:
+        return arr, bitInfo
+    return arr

deepscreen/data/featurizers/fingerprint/pharmErGfp.py ADDED Viewed

	@@ -0,0 +1,60 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Aug 17 16:54:12 2019
+@author: wanxiang.shen@u.nus.edu
+@calculate ErG fps, more info: https://pubs.acs.org/doi/full/10.1021/ci050457y#
+"""
+_type = 'Pharmacophore-based'
+import numpy as np
+from rdkit.Chem import AllChem
+## get info from : https://github.com/rdkit/rdkit/blob/d41752d558bf7200ab67b98cdd9e37f1bdd378de/Code/GraphMol/ReducedGraphs/ReducedGraphs.cpp
+# SMARTS patterns per pharmacophore property type (taken from the RDKit source linked above).
+# NOTE(review): within the visible part of this module only PROPERTY_KEY is referenced by code;
+# the pattern lists appear to be kept for reference.
+Donor = ["[N;!H0;v3,v4&+1]", "[O,S;H1;+0]", "[n&H1&+0]"]
+Acceptor = ["[O,S;H1;v2;!$(*-*=[O,N,P,S])]", "[O;H0;v2]", "[O,S;v1;-]",
+            "[N;v3;!$(N-*=[O,N,P,S])]", "[n&H0&+0]", "[o;+0;!$([o]:n);!$([o]:c:n)]"]
+Positive = ["[#7;+]", "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
+            "[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
+            "[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]"]
+Negative = ["[C,S](=[O,S,P])-[O;H1,-1]"]
+Hydrophobic = ["[C;D3,D4](-[CH3])-[CH3]", "[S;D2](-C)-C"]
+Aromatic = ["a"]
+# Property names in the order used to label fingerprint positions in GetPharmacoErGFPs' bitInfo.
+PROPERTY_KEY = ["Donor", "Acceptor", "Positive", "Negative", "Hydrophobic", "Aromatic"]
+def GetPharmacoErGFPs(mol, fuzzIncrement=0.3, maxPath=21, binary=True, return_bitInfo=False):
+    '''
+    https://pubs.acs.org/doi/full/10.1021/ci050457y#
+    return maxPath*21 bits
+    size(v) = (n(n + 1)/2) * (maxDist - minDist + 1)
+    '''
+    minPath = 1
+    arr = AllChem.GetErGFingerprint(mol, fuzzIncrement=fuzzIncrement, maxPath=maxPath, minPath=minPath)
+    arr = arr.astype(np.float32)
+    if binary:
+        arr = arr.astype(np.bool_)
+    if return_bitInfo:
+        bitInfo = []
+        for i in range(len(PROPERTY_KEY)):
+            for j in range(i, len(PROPERTY_KEY)):
+                for path in range(minPath, maxPath + 1):
+                    triplet = (PROPERTY_KEY[i], PROPERTY_KEY[j], path)
+                    bitInfo.append(triplet)
+        return arr, bitInfo
+    return arr

deepscreen/data/featurizers/fingerprint/pharmPointfp.py ADDED Viewed

	@@ -0,0 +1,59 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Aug 17 16:54:12 2019
+@author: wanxiang.shen@u.nus.edu
+Combining a set of chemical features with the 2D (topological) distances between them gives a 2D pharmacophore. When the distances are binned, unique integer ids can be assigned to each of these pharmacophores and they can be stored in a fingerprint. Details of the encoding are in: https://www.rdkit.org/docs/RDKit_Book.html#ph4-figure
+"""
+_type = 'Pharmacophore-based'
+from rdkit.Chem.Pharm2D.SigFactory import SigFactory
+from rdkit.Chem.Pharm2D import Generate
+from rdkit.Chem import DataStructs
+from rdkit.Chem import ChemicalFeatures
+import numpy as np
+import os
+# Feature-definition file bundled next to this module. The spelling 'mnimalfatures.fdef'
+# matches the file name as committed, so it must not be "corrected" here alone.
+fdef = os.path.join(os.path.dirname(__file__), 'mnimalfatures.fdef')
+# Built once at import time and shared by every GetPharmacoPFPs call.
+featFactory = ChemicalFeatures.BuildFeatureFactory(fdef)
+def GetPharmacoPFPs(mol,
+                    bins=[(i, i + 1) for i in range(20)],
+                    minPointCount=2,
+                    maxPointCount=2,
+                    return_bitInfo=False):
+    '''
+    Note: maxPointCont with 3 is slowly
+    bins = [(i,i+1) for i in range(20)],
+    maxPonitCount=2 for large-scale computation
+    '''
+    MysigFactory = SigFactory(featFactory,
+                              trianglePruneBins=False,
+                              minPointCount=minPointCount,
+                              maxPointCount=maxPointCount)
+    MysigFactory.SetBins(bins)
+    MysigFactory.Init()
+    res = Generate.Gen2DFingerprint(mol, MysigFactory)
+    arr = np.array(list(res)).astype(np.bool_)
+    if return_bitInfo:
+        description = []
+        for i in range(len(res)):
+            description.append(MysigFactory.GetBitDescription(i))
+        return arr, description
+    return arr
+# Manual smoke run: compute the fingerprint for one example SMILES when executed as a script.
+if __name__ == '__main__':
+    from rdkit import Chem
+    mol = Chem.MolFromSmiles('CC#CC(=O)NC1=NC=C2C(=C1)C(=NC=N2)NC3=CC(=C(C=C3)F)Cl')
+    # The result is assigned but not printed or checked.
+    a = GetPharmacoPFPs(mol, bins=[(i, i + 1) for i in range(20)], minPointCount=2, maxPointCount=2)

deepscreen/data/featurizers/fingerprint/pubchemfp.py ADDED Viewed

	@@ -0,0 +1,1731 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Aug 25 20:29:36 2019
+@author: charleshen
+@Note: The code is copied from PyBioMed, with a minor repair
+https://www.ncbi.nlm.nih.gov/pubmed/29556758
+these are SMARTS patterns corresponding to the PubChem fingerprints
+https://astro.temple.edu/~tua87106/list_fingerprints.pdf
+ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
+"""
+_type = 'SMARTS-based'
+import numpy as np
+from rdkit import Chem
+from rdkit import DataStructs
+import os
+import pandas as pd
+smartsPatts = {
+    1: ('[H]', 3),  # 1-115
+    2: ('[H]', 7),
+    3: ('[H]', 15),
+    4: ('[H]', 31),
+    5: ('[Li]', 0),
+    6: ('[Li]', 1),
+    7: ('[B]', 0),
+    8: ('[B]', 1),
+    9: ('[B]', 3),
+    10: ('[C]', 1),
+    11: ('[C]', 3),
+    12: ('[C]', 7),
+    13: ('[C]', 15),
+    14: ('[C]', 31),
+    15: ('[N]', 0),
+    16: ('[N]', 1),
+    17: ('[N]', 3),
+    18: ('[N]', 7),
+    19: ('[O]', 0),
+    20: ('[O]', 1),
+    21: ('[O]', 3),
+    22: ('[O]', 7),
+    23: ('[O]', 15),
+    24: ('[F]', 0),
+    25: ('[F]', 1),
+    26: ('[F]', 3),
+    27: ('[Na]', 0),
+    28: ('[Na]', 1),
+    29: ('[Si]', 0),
+    30: ('[Si]', 1),
+    31: ('[P]', 0),
+    32: ('[P]', 1),
+    33: ('[P]', 3),
+    34: ('[S]', 0),
+    35: ('[S]', 1),
+    36: ('[S]', 3),
+    37: ('[S]', 7),
+    38: ('[Cl]', 0),
+    39: ('[Cl]', 1),
+    40: ('[Cl]', 3),
+    41: ('[Cl]', 7),
+    42: ('[K]', 0),
+    43: ('[K]', 1),
+    44: ('[Br]', 0),
+    45: ('[Br]', 1),
+    46: ('[Br]', 3),
+    47: ('[I]', 0),
+    48: ('[I]', 1),
+    49: ('[I]', 3),
+    50: ('[Be]', 0),
+    51: ('[Mg]', 0),
+    52: ('[Al]', 0),
+    53: ('[Ca]', 0),
+    54: ('[Sc]', 0),
+    55: ('[Ti]', 0),
+    56: ('[V]', 0),
+    57: ('[Cr]', 0),
+    58: ('[Mn]', 0),
+    59: ('[Fe]', 0),
+    60: ('[CO]', 0),
+    61: ('[Ni]', 0),
+    62: ('[Cu]', 0),
+    63: ('[Zn]', 0),
+    64: ('[Ga]', 0),
+    65: ('[Ge]', 0),
+    66: ('[As]', 0),
+    67: ('[Se]', 0),
+    68: ('[Kr]', 0),
+    69: ('[Rb]', 0),
+    70: ('[Sr]', 0),
+    71: ('[Y]', 0),
+    72: ('[Zr]', 0),
+    73: ('[Nb]', 0),
+    74: ('[Mo]', 0),
+    75: ('[Ru]', 0),
+    76: ('[Rh]', 0),
+    77: ('[Pd]', 0),
+    78: ('[Ag]', 0),
+    79: ('[Cd]', 0),
+    80: ('[In]', 0),
+    81: ('[Sn]', 0),
+    82: ('[Sb]', 0),
+    83: ('[Te]', 0),
+    84: ('[Xe]', 0),
+    85: ('[Cs]', 0),
+    86: ('[Ba]', 0),
+    87: ('[Lu]', 0),
+    88: ('[Hf]', 0),
+    89: ('[Ta]', 0),
+    90: ('[W]', 0),
+    91: ('[Re]', 0),
+    92: ('[Os]', 0),
+    93: ('[Ir]', 0),
+    94: ('[Pt]', 0),
+    95: ('[Au]', 0),
+    96: ('[Hg]', 0),
+    97: ('[Tl]', 0),
+    98: ('[Pb]', 0),
+    99: ('[Bi]', 0),
+    100: ('[La]', 0),
+    101: ('[Ce]', 0),
+    102: ('[Pr]', 0),
+    103: ('[Nd]', 0),
+    104: ('[Pm]', 0),
+    105: ('[Sm]', 0),
+    106: ('[Eu]', 0),
+    107: ('[Gd]', 0),
+    108: ('[Tb]', 0),
+    109: ('[Dy]', 0),
+    110: ('[Ho]', 0),
+    111: ('[Er]', 0),
+    112: ('[Tm]', 0),
+    113: ('[Yb]', 0),
+    114: ('[Tc]', 0),
+    115: ('[U]', 0),
+    116: ('[Li&!H0]', 0),  # 264-881
+    117: ('[Li]~[Li]', 0),
+    118: ('[Li]~[#5]', 0),
+    119: ('[Li]~[#6]', 0),
+    120: ('[Li]~[#8]', 0),
+    121: ('[Li]~[F]', 0),
+    122: ('[Li]~[#15]', 0),
+    123: ('[Li]~[#16]', 0),
+    124: ('[Li]~[Cl]', 0),
+    125: ('[#5&!H0]', 0),
+    126: ('[#5]~[#5]', 0),
+    127: ('[#5]~[#6]', 0),
+    128: ('[#5]~[#7]', 0),
+    129: ('[#5]~[#8]', 0),
+    130: ('[#5]~[F]', 0),
+    131: ('[#5]~[#14]', 0),
+    132: ('[#5]~[#15]', 0),
+    133: ('[#5]~[#16]', 0),
+    134: ('[#5]~[Cl]', 0),
+    135: ('[#5]~[Br]', 0),
+    136: ('[#6&!H0]', 0),
+    137: ('[#6]~[#6]', 0),
+    138: ('[#6]~[#7]', 0),
+    139: ('[#6]~[#8]', 0),
+    140: ('[#6]~[F]', 0),
+    141: ('[#6]~[Na]', 0),
+    142: ('[#6]~[Mg]', 0),
+    143: ('[#6]~[Al]', 0),
+    144: ('[#6]~[#14]', 0),
+    145: ('[#6]~[#15]', 0),
+    146: ('[#6]~[#16]', 0),
+    147: ('[#6]~[Cl]', 0),
+    148: ('[#6]~[#33]', 0),
+    149: ('[#6]~[#34]', 0),
+    150: ('[#6]~[Br]', 0),
+    151: ('[#6]~[I]', 0),
+    152: ('[#7&!H0]', 0),
+    153: ('[#7]~[#7]', 0),
+    154: ('[#7]~[#8]', 0),
+    155: ('[#7]~[F]', 0),
+    156: ('[#7]~[#14]', 0),
+    157: ('[#7]~[#15]', 0),
+    158: ('[#7]~[#16]', 0),
+    159: ('[#7]~[Cl]', 0),
+    160: ('[#7]~[Br]', 0),
+    161: ('[#8&!H0]', 0),
+    162: ('[#8]~[#8]', 0),
+    163: ('[#8]~[Mg]', 0),
+    164: ('[#8]~[Na]', 0),
+    165: ('[#8]~[Al]', 0),
+    166: ('[#8]~[#14]', 0),
+    167: ('[#8]~[#15]', 0),
+    168: ('[#8]~[K]', 0),
+    169: ('[F]~[#15]', 0),
+    170: ('[F]~[#16]', 0),
+    171: ('[Al&!H0]', 0),
+    172: ('[Al]~[Cl]', 0),
+    173: ('[#14&!H0]', 0),
+    174: ('[#14]~[#14]', 0),
+    175: ('[#14]~[Cl]', 0),
+    176: ('[#15&!H0]', 0),
+    177: ('[#15]~[#15]', 0),
+    178: ('[#33&!H0]', 0),
+    179: ('[#33]~[#33]', 0),
+    180: ('[#6](~Br)(~[#6])', 0),
+    181: ('[#6](~Br)(~[#6])(~[#6])', 0),
+    182: ('[#6&!H0]~[Br]', 0),
+    183: ('[#6](~[Br])(:[c])', 0),
+    184: ('[#6](~[Br])(:[n])', 0),
+    185: ('[#6](~[#6])(~[#6])', 0),
+    186: ('[#6](~[#6])(~[#6])(~[#6])', 0),
+    187: ('[#6](~[#6])(~[#6])(~[#6])(~[#6])', 0),
+    188: ('[#6H1](~[#6])(~[#6])(~[#6])', 0),
+    189: ('[#6](~[#6])(~[#6])(~[#6])(~[#7])', 0),
+    190: ('[#6](~[#6])(~[#6])(~[#6])(~[#8])', 0),
+    191: ('[#6H1](~[#6])(~[#6])(~[#7])', 0),
+    192: ('[#6H1](~[#6])(~[#6])(~[#8])', 0),
+    193: ('[#6](~[#6])(~[#6])(~[#7])', 0),
+    194: ('[#6](~[#6])(~[#6])(~[#8])', 0),
+    195: ('[#6](~[#6])(~[Cl])', 0),
+    196: ('[#6&!H0](~[#6])(~[Cl])', 0),
+    197: ('[#6H,#6H2,#6H3,#6H4]~[#6]', 0),
+    198: ('[#6&!H0](~[#6])(~[#7])', 0),
+    199: ('[#6&!H0](~[#6])(~[#8])', 0),
+    200: ('[#6H1](~[#6])(~[#8])(~[#8])', 0),
+    201: ('[#6&!H0](~[#6])(~[#15])', 0),
+    202: ('[#6&!H0](~[#6])(~[#16])', 0),
+    203: ('[#6](~[#6])(~[I])', 0),
+    204: ('[#6](~[#6])(~[#7])', 0),
+    205: ('[#6](~[#6])(~[#8])', 0),
+    206: ('[#6](~[#6])(~[#16])', 0),
+    207: ('[#6](~[#6])(~[#14])', 0),
+    208: ('[#6](~[#6])(:c)', 0),
+    209: ('[#6](~[#6])(:c)(:c)', 0),
+    210: ('[#6](~[#6])(:c)(:n)', 0),
+    211: ('[#6](~[#6])(:n)', 0),
+    212: ('[#6](~[#6])(:n)(:n)', 0),
+    213: ('[#6](~[Cl])(~[Cl])', 0),
+    214: ('[#6&!H0](~[Cl])', 0),
+    215: ('[#6](~[Cl])(:c)', 0),
+    216: ('[#6](~[F])(~[F])', 0),
+    217: ('[#6](~[F])(:c)', 0),
+    218: ('[#6&!H0](~[#7])', 0),
+    219: ('[#6&!H0](~[#8])', 0),
+    220: ('[#6&!H0](~[#8])(~[#8])', 0),
+    221: ('[#6&!H0](~[#16])', 0),
+    222: ('[#6&!H0](~[#14])', 0),
+    223: ('[#6&!H0]:c', 0),
+    224: ('[#6&!H0](:c)(:c)', 0),
+    225: ('[#6&!H0](:c)(:n)', 0),
+    226: ('[#6&!H0](:n)', 0),
+    227: ('[#6H3]', 0),
+    228: ('[#6](~[#7])(~[#7])', 0),
+    229: ('[#6](~[#7])(:c)', 0),
+    230: ('[#6](~[#7])(:c)(:c)', 0),
+    231: ('[#6](~[#7])(:c)(:n)', 0),
+    232: ('[#6](~[#7])(:n)', 0),
+    233: ('[#6](~[#8])(~[#8])', 0),
+    234: ('[#6](~[#8])(:c)', 0),
+    235: ('[#6](~[#8])(:c)(:c)', 0),
+    236: ('[#6](~[#16])(:c)', 0),
+    237: ('[#6](:c)(:c)', 0),
+    238: ('[#6](:c)(:c)(:c)', 0),
+    239: ('[#6](:c)(:c)(:n)', 0),
+    240: ('[#6](:c)(:n)', 0),
+    241: ('[#6](:c)(:n)(:n)', 0),
+    242: ('[#6](:n)(:n)', 0),
+    243: ('[#7](~[#6])(~[#6])', 0),
+    244: ('[#7](~[#6])(~[#6])(~[#6])', 0),
+    245: ('[#7&!H0](~[#6])(~[#6])', 0),
+    246: ('[#7&!H0](~[#6])', 0),
+    247: ('[#7&!H0](~[#6])(~[#7])', 0),
+    248: ('[#7](~[#6])(~[#8])', 0),
+    249: ('[#7](~[#6])(:c)', 0),
+    250: ('[#7](~[#6])(:c)(:c)', 0),
+    251: ('[#7&!H0](~[#7])', 0),
+    252: ('[#7&!H0](:c)', 0),
+    253: ('[#7&!H0](:c)(:c)', 0),
+    254: ('[#7](~[#8])(~[#8])', 0),
+    255: ('[#7](~[#8])(:o)', 0),
+    256: ('[#7](:c)(:c)', 0),
+    257: ('[#7](:c)(:c)(:c)', 0),
+    258: ('[#8](~[#6])(~[#6])', 0),
+    259: ('[#8&!H0](~[#6])', 0),
+    260: ('[#8](~[#6])(~[#15])', 0),
+    261: ('[#8&!H0](~[#16])', 0),
+    262: ('[#8](:c)(:c)', 0),
+    263: ('[#15](~[#6])(~[#6])', 0),
+    264: ('[#15](~[#8])(~[#8])', 0),
+    265: ('[#16](~[#6])(~[#6])', 0),
+    266: ('[#16&!H0](~[#6])', 0),
+    267: ('[#16](~[#6])(~[#8])', 0),
+    268: ('[#14](~[#6])(~[#6])', 0),
+    269: ('[#6]=,:[#6]', 0),
+    270: ('[#6]#[#6]', 0),
+    271: ('[#6]=,:[#7]', 0),
+    272: ('[#6]#[#7]', 0),
+    273: ('[#6]=,:[#8]', 0),
+    274: ('[#6]=,:[#16]', 0),
+    275: ('[#7]=,:[#7]', 0),
+    276: ('[#7]=,:[#8]', 0),
+    277: ('[#7]=,:[#15]', 0),
+    278: ('[#15]=,:[#8]', 0),
+    279: ('[#15]=,:[#15]', 0),
+    280: ('[#6](#[#6])(-,:[#6])', 0),
+    281: ('[#6&!H0](#[#6])', 0),
+    282: ('[#6](#[#7])(-,:[#6])', 0),
+    283: ('[#6](-,:[#6])(-,:[#6])(=,:[#6])', 0),
+    284: ('[#6](-,:[#6])(-,:[#6])(=,:[#7])', 0),
+    285: ('[#6](-,:[#6])(-,:[#6])(=,:[#8])', 0),
+    286: ('[#6](-,:[#6])([Cl])(=,:[#8])', 0),
+    287: ('[#6&!H0](-,:[#6])(=,:[#6])', 0),
+    288: ('[#6&!H0](-,:[#6])(=,:[#7])', 0),
+    289: ('[#6&!H0](-,:[#6])(=,:[#8])', 0),
+    290: ('[#6](-,:[#6])(-,:[#7])(=,:[#6])', 0),
+    291: ('[#6](-,:[#6])(-,:[#7])(=,:[#7])', 0),
+    292: ('[#6](-,:[#6])(-,:[#7])(=,:[#8])', 0),
+    293: ('[#6](-,:[#6])(-,:[#8])(=,:[#8])', 0),
+    294: ('[#6](-,:[#6])(=,:[#6])', 0),
+    295: ('[#6](-,:[#6])(=,:[#7])', 0),
+    296: ('[#6](-,:[#6])(=,:[#8])', 0),
+    297: ('[#6]([Cl])(=,:[#8])', 0),
+    298: ('[#6&!H0](-,:[#7])(=,:[#6])', 0),
+    299: ('[#6&!H0](=,:[#6])', 0),
+    300: ('[#6&!H0](=,:[#7])', 0),
+    301: ('[#6&!H0](=,:[#8])', 0),
+    302: ('[#6](-,:[#7])(=,:[#6])', 0),
+    303: ('[#6](-,:[#7])(=,:[#7])', 0),
+    304: ('[#6](-,:[#7])(=,:[#8])', 0),
+    305: ('[#6](-,:[#8])(=,:[#8])', 0),
+    306: ('[#7](-,:[#6])(=,:[#6])', 0),
+    307: ('[#7](-,:[#6])(=,:[#8])', 0),
+    308: ('[#7](-,:[#8])(=,:[#8])', 0),
+    309: ('[#15](-,:[#8])(=,:[#8])', 0),
+    310: ('[#16](-,:[#6])(=,:[#8])', 0),
+    311: ('[#16](-,:[#8])(=,:[#8])', 0),
+    312: ('[#16](=,:[#8])(=,:[#8])', 0),
+    313: ('[#6]-,:[#6]-,:[#6]#[#6]', 0),
+    314: ('[#8]-,:[#6]-,:[#6]=,:[#7]', 0),
+    315: ('[#8]-,:[#6]-,:[#6]=,:[#8]', 0),
+    316: ('[#7]:[#6]-,:[#16&!H0]', 0),
+    317: ('[#7]-,:[#6]-,:[#6]=,:[#6]', 0),
+    318: ('[#8]=,:[#16]-,:[#6]-,:[#6]', 0),
+    319: ('[#7]#[#6]-,:[#6]=,:[#6]', 0),
+    320: ('[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
+    321: ('[#8]=,:[#16]-,:[#6]-,:[#7]', 0),
+    322: ('[#16]-,:[#16]-,:[#6]:[#6]', 0),
+    323: ('[#6]:[#6]-,:[#6]=,:[#6]', 0),
+    324: ('[#16]:[#6]:[#6]:[#6]', 0),
+    325: ('[#6]:[#7]:[#6]-,:[#6]', 0),
+    326: ('[#16]-,:[#6]:[#7]:[#6]', 0),
+    327: ('[#16]:[#6]:[#6]:[#7]', 0),
+    328: ('[#16]-,:[#6]=,:[#7]-,:[#6]', 0),
+    329: ('[#6]-,:[#8]-,:[#6]=,:[#6]', 0),
+    330: ('[#7]-,:[#7]-,:[#6]:[#6]', 0),
+    331: ('[#16]-,:[#6]=,:[#7&!H0]', 0),
+    332: ('[#16]-,:[#6]-,:[#16]-,:[#6]', 0),
+    333: ('[#6]:[#16]:[#6]-,:[#6]', 0),
+    334: ('[#8]-,:[#16]-,:[#6]:[#6]', 0),
+    335: ('[#6]:[#7]-,:[#6]:[#6]', 0),
+    336: ('[#7]-,:[#16]-,:[#6]:[#6]', 0),
+    337: ('[#7]-,:[#6]:[#7]:[#6]', 0),
+    338: ('[#7]:[#6]:[#6]:[#7]', 0),
+    339: ('[#7]-,:[#6]:[#7]:[#7]', 0),
+    340: ('[#7]-,:[#6]=,:[#7]-,:[#6]', 0),
+    341: ('[#7]-,:[#6]=,:[#7&!H0]', 0),
+    342: ('[#7]-,:[#6]-,:[#16]-,:[#6]', 0),
+    343: ('[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
+    344: ('[#6]-,:[#7]:[#6&!H0]', 0),
+    345: ('[#7]-,:[#6]:[#8]:[#6]', 0),
+    346: ('[#8]=,:[#6]-,:[#6]:[#6]', 0),
+    347: ('[#8]=,:[#6]-,:[#6]:[#7]', 0),
+    348: ('[#6]-,:[#7]-,:[#6]:[#6]', 0),
+    349: ('[#7]:[#7]-,:[#6&!H0]', 0),
+    350: ('[#8]-,:[#6]:[#6]:[#7]', 0),
+    351: ('[#8]-,:[#6]=,:[#6]-,:[#6]', 0),
+    352: ('[#7]-,:[#6]:[#6]:[#7]', 0),
+    353: ('[#6]-,:[#16]-,:[#6]:[#6]', 0),
+    354: ('[Cl]-,:[#6]:[#6]-,:[#6]', 0),
+    355: ('[#7]-,:[#6]=,:[#6&!H0]', 0),
+    356: ('[Cl]-,:[#6]:[#6&!H0]', 0),
+    357: ('[#7]:[#6]:[#7]-,:[#6]', 0),
+    358: ('[Cl]-,:[#6]:[#6]-,:[#8]', 0),
+    359: ('[#6]-,:[#6]:[#7]:[#6]', 0),
+    360: ('[#6]-,:[#6]-,:[#16]-,:[#6]', 0),
+    361: ('[#16]=,:[#6]-,:[#7]-,:[#6]', 0),
+    362: ('[Br]-,:[#6]:[#6]-,:[#6]', 0),
+    363: ('[#7&!H0]-,:[#7&!H0]', 0),
+    364: ('[#16]=,:[#6]-,:[#7&!H0]', 0),
+    365: ('[#6]-,:[#33]-[#8&!H0]', 0),
+    366: ('[#16]:[#6]:[#6&!H0]', 0),
+    367: ('[#8]-,:[#7]-,:[#6]-,:[#6]', 0),
+    368: ('[#7]-,:[#7]-,:[#6]-,:[#6]', 0),
+    369: ('[#6H,#6H2,#6H3]=,:[#6H,#6H2,#6H3]', 0),
+    370: ('[#7]-,:[#7]-,:[#6]-,:[#7]', 0),
+    371: ('[#8]=,:[#6]-,:[#7]-,:[#7]', 0),
+    372: ('[#7]=,:[#6]-,:[#7]-,:[#6]', 0),
+    373: ('[#6]=,:[#6]-,:[#6]:[#6]', 0),
+    374: ('[#6]:[#7]-,:[#6&!H0]', 0),
+    375: ('[#6]-,:[#7]-,:[#7&!H0]', 0),
+    376: ('[#7]:[#6]:[#6]-,:[#6]', 0),
+    377: ('[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
+    378: ('[#33]-,:[#6]:[#6&!H0]', 0),
+    379: ('[Cl]-,:[#6]:[#6]-,:[Cl]', 0),
+    380: ('[#6]:[#6]:[#7&!H0]', 0),
+    381: ('[#7&!H0]-,:[#6&!H0]', 0),
+    382: ('[Cl]-,:[#6]-,:[#6]-,:[Cl]', 0),
+    383: ('[#7]:[#6]-,:[#6]:[#6]', 0),
+    384: ('[#16]-,:[#6]:[#6]-,:[#6]', 0),
+    385: ('[#16]-,:[#6]:[#6&!H0]', 0),
+    386: ('[#16]-,:[#6]:[#6]-,:[#7]', 0),
+    387: ('[#16]-,:[#6]:[#6]-,:[#8]', 0),
+    388: ('[#8]=,:[#6]-,:[#6]-,:[#6]', 0),
+    389: ('[#8]=,:[#6]-,:[#6]-,:[#7]', 0),
+    390: ('[#8]=,:[#6]-,:[#6]-,:[#8]', 0),
+    391: ('[#7]=,:[#6]-,:[#6]-,:[#6]', 0),
+    392: ('[#7]=,:[#6]-,:[#6&!H0]', 0),
+    393: ('[#6]-,:[#7]-,:[#6&!H0]', 0),
+    394: ('[#8]-,:[#6]:[#6]-,:[#6]', 0),
+    395: ('[#8]-,:[#6]:[#6&!H0]', 0),
+    396: ('[#8]-,:[#6]:[#6]-,:[#7]', 0),
+    397: ('[#8]-,:[#6]:[#6]-,:[#8]', 0),
+    398: ('[#7]-,:[#6]:[#6]-,:[#6]', 0),
+    399: ('[#7]-,:[#6]:[#6&!H0]', 0),
+    400: ('[#7]-,:[#6]:[#6]-,:[#7]', 0),
+    401: ('[#8]-,:[#6]-,:[#6]:[#6]', 0),
+    402: ('[#7]-,:[#6]-,:[#6]:[#6]', 0),
+    403: ('[Cl]-,:[#6]-,:[#6]-,:[#6]', 0),
+    404: ('[Cl]-,:[#6]-,:[#6]-,:[#8]', 0),
+    405: ('[#6]:[#6]-,:[#6]:[#6]', 0),
+    406: ('[#8]=,:[#6]-,:[#6]=,:[#6]', 0),
+    407: ('[Br]-,:[#6]-,:[#6]-,:[#6]', 0),
+    408: ('[#7]=,:[#6]-,:[#6]=,:[#6]', 0),
+    409: ('[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
+    410: ('[#7]:[#6]-,:[#8&!H0]', 0),
+    411: ('[#8]=,:[#7]-,:c:c', 0),
+    412: ('[#8]-,:[#6]-,:[#7&!H0]', 0),
+    413: ('[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
+    414: ('[Cl]-,:[#6]-,:[#6]=,:[#8]', 0),
+    415: ('[Br]-,:[#6]-,:[#6]=,:[#8]', 0),
+    416: ('[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
+    417: ('[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
+    418: ('[#6]:[#6]-,:[#8]-,:[#6]', 0),
+    419: ('[#8]-,:[#6]-,:[#6]-,:[#7]', 0),
+    420: ('[#8]-,:[#6]-,:[#6]-,:[#8]', 0),
+    421: ('N#[#6]-,:[#6]-,:[#6]', 0),
+    422: ('[#7]-,:[#6]-,:[#6]-,:[#7]', 0),
+    423: ('[#6]:[#6]-,:[#6]-,:[#6]', 0),
+    424: ('[#6&!H0]-,:[#8&!H0]', 0),
+    425: ('n:c:n:c', 0),
+    426: ('[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
+    427: ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
+    428: ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
+    429: ('[#7]=,:[#6]-,:[#6]:[#6&!H0]', 0),
+    430: ('c:c-,:[#7]-,:c:c', 0),
+    431: ('[#6]-,:[#6]:[#6]-,:c:c', 0),
+    432: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    433: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    434: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    435: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    436: ('[Cl]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
+    437: ('c:c-,:[#6]=,:[#6]-,:[#6]', 0),
+    438: ('[#6]-,:[#6]:[#6]-,:[#7]-,:[#6]', 0),
+    439: ('[#6]-,:[#16]-,:[#6]-,:[#6]-,:[#6]', 0),
+    440: ('[#7]-,:[#6]:[#6]-,:[#8&!H0]', 0),
+    441: ('[#8]=,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
+    442: ('[#6]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
+    443: ('[#6]-,:[#6]:[#6]-,:[#8&!H0]', 0),
+    444: ('[Cl]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    445: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    446: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    447: ('[#6]-,:[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
+    448: ('c:c-,:[#6]-,:[#6]-,:[#6]', 0),
+    449: ('[#7]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
+    450: ('[#8]=,:[#6]-,:[#6]-,:c:c', 0),
+    451: ('[Cl]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
+    452: ('[#6H,#6H2,#6H3]-,:[#6]=,:[#6H,#6H2,#6H3]', 0),
+    453: ('[#7]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
+    454: ('[#7]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
+    455: ('[#8]=,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    456: ('[#6]-,:c:c:[#6]-,:[#6]', 0),
+    457: ('[#6]-,:[#8]-,:[#6]-,:[#6]:c', 0),
+    458: ('[#8]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    459: ('[#8]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
+    460: ('[#7]-,:[#6]-,:[#6]-,:[#6]:c', 0),
+    461: ('[#6]-,:[#6]-,:[#6]-,:[#6]:c', 0),
+    462: ('[Cl]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    463: ('[#6]-,:[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
+    464: ('[#7]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    465: ('[#7]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
+    466: ('[#6]-,:[#7]-,:[#6]-,:[#6]-,:[#6]', 0),
+    467: ('[#6]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
+    468: ('[#7]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    469: ('c:c:n:n:c', 0),
+    470: ('[#6]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
+    471: ('c:[#6]-,:[#6]-,:[#6]:c', 0),
+    472: ('[#8]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
+    473: ('c:c-,:[#8]-,:[#6]-,:[#6]', 0),
+    474: ('[#7]-,:[#6]:c:c:n', 0),
+    475: ('[#8]=,:[#6]-,:[#8]-,:[#6]:c', 0),
+    476: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
+    477: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#7]', 0),
+    478: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
+    479: ('[#6]-,:[#8]-,:[#6]:[#6]-,:[#6]', 0),
+    480: ('[#8]=,:[#33]-,:[#6]:c:c', 0),
+    481: ('[#6]-,:[#7]-,:[#6]-,:[#6]:c', 0),
+    482: ('[#16]-,:[#6]:c:c-,:[#7]', 0),
+    483: ('[#8]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
+    484: ('[#8]-,:[#6]:[#6]-,:[#8&!H0]', 0),
+    485: ('[#6]-,:[#6]-,:[#8]-,:[#6]:c', 0),
+    486: ('[#7]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
+    487: ('[#6]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
+    488: ('[#7]-,:[#7]-,:[#6]-,:[#7&!H0]', 0),
+    489: ('[#6]-,:[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
+    490: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    491: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    492: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    493: ('[#6]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    494: ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
+    495: ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
+    496: ('[#6&!H0]-,:[#6]-,:[#7&!H0]', 0),
+    497: ('[#6]-,:[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
+    498: ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
+    499: ('[#8]=,:[#6]-,:[#7]-,:[#6&!H0]', 0),
+    500: ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#7]', 0),
+    501: ('[#8]=,:[#7]-,:[#6]:[#6]-,:[#7]', 0),
+    502: ('[#8]=,:[#7]-,:c:c-,:[#8]', 0),
+    503: ('[#8]=,:[#6]-,:[#7]-,:[#6]=,:[#8]', 0),
+    504: ('[#8]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
+    505: ('[#8]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
+    506: ('[#8]-,:[#6]:[#6]:[#6]-,:[#8]', 0),
+    507: ('[#7]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
+    508: ('[#8]-,:[#6]-,:[#6]-,:[#6]:c', 0),
+    509: ('[#6]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
+    510: ('[#6]-,:[#7]-,:[#6]:[#6]-,:[#6]', 0),
+    511: ('[#6]-,:[#6]-,:[#16]-,:[#6]-,:[#6]', 0),
+    512: ('[#8]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    513: ('[#6]-,:[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
+    514: ('[#8]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
+    515: ('[#8]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    516: ('[#8]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
+    517: ('[#6]-,:[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
+    518: ('[#7]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
+    519: ('[#6]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    520: ('[#6]=,:[#6]-,:[#6]-,:[#8&!H0]', 0),
+    521: ('[#6]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
+    522: ('[Cl]-,:[#6]:[#6]-,:[#6]=,:[#8]', 0),
+    523: ('[Br]-,:[#6]:c:c-,:[#6]', 0),
+    524: ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
+    525: ('[#8]=,:[#6]-,:[#6]=,:[#6&!H0]', 0),
+    526: ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#7]', 0),
+    527: ('[#7]-,:[#6]-,:[#7]-,:[#6]:c', 0),
+    528: ('[Br]-,:[#6]-,:[#6]-,:[#6]:c', 0),
+    529: ('[#7]#[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    530: ('[#6]-,:[#6]=,:[#6]-,:[#6]:c', 0),
+    531: ('[#6]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
+    532: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    533: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    534: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    535: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    536: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    537: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    538: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    539: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    540: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
+    541: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    542: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    543: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    544: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    545: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    546: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    547: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
+    548: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    549: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    550: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
+    551: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    552: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
+    553: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    554: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
+    555: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    556: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
+    557: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    558: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
+    559: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](=,:[#8])-,:[#6]', 0),
+    560: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
+    561: ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
+    562: ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]-,:[#6]', 0),
+    563: ('[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
+    564: ('[#6]-,:[#6](-,:[#6])(-,:[#6])-,:[#6]-,:[#6]', 0),
+    565: ('[#6]-,:[#6](-,:[#6])-,:[#6](-,:[#6])-,:[#6]', 0),
+    566: ('[#6]c1ccc([#6])cc1', 0),
+    567: ('[#6]c1ccc([#8])cc1', 0),
+    568: ('[#6]c1ccc([#16])cc1', 0),
+    569: ('[#6]c1ccc([#7])cc1', 0),
+    570: ('[#6]c1ccc(Cl)cc1', 0),
+    571: ('[#6]c1ccc(Br)cc1', 0),
+    572: ('[#8]c1ccc([#8])cc1', 0),
+    573: ('[#8]c1ccc([#16])cc1', 0),
+    574: ('[#8]c1ccc([#7])cc1', 0),
+    575: ('[#8]c1ccc(Cl)cc1', 0),
+    576: ('[#8]c1ccc(Br)cc1', 0),
+    577: ('[#16]c1ccc([#16])cc1', 0),
+    578: ('[#16]c1ccc([#7])cc1', 0),
+    579: ('[#16]c1ccc(Cl)cc1', 0),
+    580: ('[#16]c1ccc(Br)cc1', 0),
+    581: ('[#7]c1ccc([#7])cc1', 0),
+    582: ('[#7]c1ccc(Cl)cc1', 0),
+    583: ('[#7]c1ccc(Br)cc1', 0),
+    584: ('Clc1ccc(Cl)cc1', 0),
+    585: ('Clc1ccc(Br)cc1', 0),
+    586: ('Brc1ccc(Br)cc1', 0),
+    587: ('[#6]c1cc([#6])ccc1', 0),
+    588: ('[#6]c1cc([#8])ccc1', 0),
+    589: ('[#6]c1cc([#16])ccc1', 0),
+    590: ('[#6]c1cc([#7])ccc1', 0),
+    591: ('[#6]c1cc(Cl)ccc1', 0),
+    592: ('[#6]c1cc(Br)ccc1', 0),
+    593: ('[#8]c1cc([#8])ccc1', 0),
+    594: ('[#8]c1cc([#16])ccc1', 0),
+    595: ('[#8]c1cc([#7])ccc1', 0),
+    596: ('[#8]c1cc(Cl)ccc1', 0),
+    597: ('[#8]c1cc(Br)ccc1', 0),
+    598: ('[#16]c1cc([#16])ccc1', 0),
+    599: ('[#16]c1cc([#7])ccc1', 0),
+    600: ('[#16]c1cc(Cl)ccc1', 0),
+    601: ('[#16]c1cc(Br)ccc1', 0),
+    602: ('[#7]c1cc([#7])ccc1', 0),
+    603: ('[#7]c1cc(Cl)ccc1', 0),
+    604: ('[#7]c1cc(Br)ccc1', 0),
+    605: ('Clc1cc(Cl)ccc1', 0),
+    606: ('Clc1cc(Br)ccc1', 0),
+    607: ('Brc1cc(Br)ccc1', 0),
+    608: ('[#6]c1c([#6])cccc1', 0),
+    609: ('[#6]c1c([#8])cccc1', 0),
+    610: ('[#6]c1c([#16])cccc1', 0),
+    611: ('[#6]c1c([#7])cccc1', 0),
+    612: ('[#6]c1c(Cl)cccc1', 0),
+    613: ('[#6]c1c(Br)cccc1', 0),
+    614: ('[#8]c1c([#8])cccc1', 0),
+    615: ('[#8]c1c([#16])cccc1', 0),
+    616: ('[#8]c1c([#7])cccc1', 0),
+    617: ('[#8]c1c(Cl)cccc1', 0),
+    618: ('[#8]c1c(Br)cccc1', 0),
+    619: ('[#16]c1c([#16])cccc1', 0),
+    620: ('[#16]c1c([#7])cccc1', 0),
+    621: ('[#16]c1c(Cl)cccc1', 0),
+    622: ('[#16]c1c(Br)cccc1', 0),
+    623: ('[#7]c1c([#7])cccc1', 0),
+    624: ('[#7]c1c(Cl)cccc1', 0),
+    625: ('[#7]c1c(Br)cccc1', 0),
+    626: ('Clc1c(Cl)cccc1', 0),
+    627: ('Clc1c(Br)cccc1', 0),
+    628: ('Brc1c(Br)cccc1', 0),
+    629: ('[#6][#6]1[#6][#6][#6]([#6])[#6][#6]1', 0),
+    630: ('[#6][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
+    631: ('[#6][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
+    632: ('[#6][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
+    633: ('[#6][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    634: ('[#6][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    635: ('[#8][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
+    636: ('[#8][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
+    637: ('[#8][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
+    638: ('[#8][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    639: ('[#8][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    640: ('[#16][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
+    641: ('[#16][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
+    642: ('[#16][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    643: ('[#16][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    644: ('[#7][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
+    645: ('[#7][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    646: ('[#7][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    647: ('Cl[#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    648: ('Cl[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    649: ('Br[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    650: ('[#6][#6]1[#6][#6]([#6])[#6][#6][#6]1', 0),
+    651: ('[#6][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
+    652: ('[#6][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
+    653: ('[#6][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
+    654: ('[#6][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    655: ('[#6][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    656: ('[#8][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
+    657: ('[#8][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
+    658: ('[#8][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
+    659: ('[#8][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    660: ('[#8][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    661: ('[#16][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
+    662: ('[#16][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
+    663: ('[#16][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    664: ('[#16][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    665: ('[#7][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
+    666: ('[#7][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    667: ('[#7][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    668: ('Cl[#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    669: ('Cl[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    670: ('Br[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    671: ('[#6][#6]1[#6]([#6])[#6][#6][#6][#6]1', 0),
+    672: ('[#6][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
+    673: ('[#6][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
+    674: ('[#6][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
+    675: ('[#6][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    676: ('[#6][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    677: ('[#8][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
+    678: ('[#8][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
+    679: ('[#8][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
+    680: ('[#8][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    681: ('[#8][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    682: ('[#16][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
+    683: ('[#16][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
+    684: ('[#16][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    685: ('[#16][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    686: ('[#7][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
+    687: ('[#7][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    688: ('[#7][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    689: ('Cl[#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    690: ('Cl[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    691: ('Br[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    692: ('[#6][#6]1[#6][#6]([#6])[#6][#6]1', 0),
+    693: ('[#6][#6]1[#6][#6]([#8])[#6][#6]1', 0),
+    694: ('[#6][#6]1[#6][#6]([#16])[#6][#6]1', 0),
+    695: ('[#6][#6]1[#6][#6]([#7])[#6][#6]1', 0),
+    696: ('[#6][#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    697: ('[#6][#6]1[#6][#6](Br)[#6][#6]1', 0),
+    698: ('[#8][#6]1[#6][#6]([#8])[#6][#6]1', 0),
+    699: ('[#8][#6]1[#6][#6]([#16])[#6][#6]1', 0),
+    700: ('[#8][#6]1[#6][#6]([#7])[#6][#6]1', 0),
+    701: ('[#8][#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    702: ('[#8][#6]1[#6][#6](Br)[#6][#6]1', 0),
+    703: ('[#16][#6]1[#6][#6]([#16])[#6][#6]1', 0),
+    704: ('[#16][#6]1[#6][#6]([#7])[#6][#6]1', 0),
+    705: ('[#16][#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    706: ('[#16][#6]1[#6][#6](Br)[#6][#6]1', 0),
+    707: ('[#7][#6]1[#6][#6]([#7])[#6][#6]1', 0),
+    708: ('[#7][#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    709: ('[#7][#6]1[#6][#6](Br)[#6][#6]1', 0),
+    710: ('Cl[#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    711: ('Cl[#6]1[#6][#6](Br)[#6][#6]1', 0),
+    712: ('Br[#6]1[#6][#6](Br)[#6][#6]1', 0),
+    713: ('[#6][#6]1[#6]([#6])[#6][#6][#6]1', 0),
+    714: ('[#6][#6]1[#6]([#8])[#6][#6][#6]1', 0),
+    715: ('[#6][#6]1[#6]([#16])[#6][#6][#6]1', 0),
+    716: ('[#6][#6]1[#6]([#7])[#6][#6][#6]1', 0),
+    717: ('[#6][#6]1[#6](Cl)[#6][#6][#6]1', 0),
+    718: ('[#6][#6]1[#6](Br)[#6][#6][#6]1', 0),
+    719: ('[#8][#6]1[#6]([#8])[#6][#6][#6]1', 0),
+    720: ('[#8][#6]1[#6]([#16])[#6][#6][#6]1', 0),
+    721: ('[#8][#6]1[#6]([#7])[#6][#6][#6]1', 0),
+    722: ('[#8][#6]1[#6](Cl)[#6][#6][#6]1', 0),
+    723: ('[#8][#6]1[#6](Br)[#6][#6][#6]1', 0),
+    724: ('[#16][#6]1[#6]([#16])[#6][#6][#6]1', 0),
+    725: ('[#16][#6]1[#6]([#7])[#6][#6][#6]1', 0),
+    726: ('[#16][#6]1[#6](Cl)[#6][#6][#6]1', 0),
+    727: ('[#16][#6]1[#6](Br)[#6][#6][#6]1', 0),
+    728: ('[#7][#6]1[#6]([#7])[#6][#6][#6]1', 0),
+    729: ('[#7][#6]1[#6](Cl)[#6][#6]1', 0),
+    730: ('[#7][#6]1[#6](Br)[#6][#6][#6]1', 0),
+    731: ('Cl[#6]1[#6](Cl)[#6][#6][#6]1', 0),
+    732: ('Cl[#6]1[#6](Br)[#6][#6][#6]1', 0),
+    733: ('Br[#6]1[#6](Br)[#6][#6][#6]1', 0)}
+# Module-level cache of (compiled SMARTS query, count threshold) pairs.
+# It stays None until the first calcPubChemFingerPart1 call, which sizes it
+# from smartsPatts and fills it through InitKeys.
+PubchemKeys = None
+def InitKeys(keyList, keyDict):
+    """ *Internal Use Only*
+    generates SMARTS patterns for the keys, run once
+    """
+    assert len(keyList) == len(keyDict.keys()), 'length mismatch'
+    for key in keyDict.keys():
+        patt, count = keyDict[key]
+        if patt != '?':
+            sma = Chem.MolFromSmarts(patt)
+            if not sma:
+                print('SMARTS parser error for key #%d: %s' % (key, patt))
+            else:
+                keyList[key - 1] = sma, count
+def calcPubChemFingerPart1(mol, **kwargs):
+    """  Calculate PubChem Fingerprints （1-115; 263-881)
+    **Arguments**
+     - mol: the molecule to be fingerprinted
+     - any extra keyword arguments are ignored
+    **Returns**
+      a _DataStructs.SparseBitVect_ containing the fingerprint.
+    >>> m = Chem.MolFromSmiles('CNO')
+    >>> bv = PubChemFingerPart1(m)
+    >>> tuple(bv.GetOnBits())
+    (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
+    >>> bv = PubChemFingerPart1(Chem.MolFromSmiles('CCC'))
+    >>> tuple(bv.GetOnBits())
+    (74, 114, 149, 155, 160)
+    """
+    global PubchemKeys
+    if PubchemKeys is None:
+        PubchemKeys = [(None, 0)] * len(smartsPatts.keys())
+        InitKeys(PubchemKeys, smartsPatts)
+    ctor = kwargs.get('ctor', DataStructs.SparseBitVect)
+    res = ctor(len(PubchemKeys) + 1)
+    for i, (patt, count) in enumerate(PubchemKeys):
+        if patt is not None:
+            if count == 0:
+                res[i + 1] = mol.HasSubstructMatch(patt)
+            else:
+                matches = mol.GetSubstructMatches(patt)
+                if len(matches) > count:
+                    res[i + 1] = 1
+    return res
+def func_1(mol, bits):
+    """ *Internal Use Only*
+    Calculate PubChem Fingerprints （116-263)
+    """
+    ringSize = []
+    temp = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
+    AllRingsAtom = mol.GetRingInfo().AtomRings()
+    for ring in AllRingsAtom:
+        ringSize.append(len(ring))
+        for k, v in temp.items():
+            if len(ring) == k:
+                temp[k] += 1
+    if temp[3] >= 2:
+        bits[0] = 1
+        bits[7] = 1
+    elif temp[3] == 1:
+        bits[0] = 1
+    else:
+        pass
+    if temp[4] >= 2:
+        bits[14] = 1
+        bits[21] = 1
+    elif temp[4] == 1:
+        bits[14] = 1
+    else:
+        pass
+    if temp[5] >= 5:
+        bits[28] = 1
+        bits[35] = 1
+        bits[42] = 1
+        bits[49] = 1
+        bits[56] = 1
+    elif temp[5] == 4:
+        bits[28] = 1
+        bits[35] = 1
+        bits[42] = 1
+        bits[49] = 1
+    elif temp[5] == 3:
+        bits[28] = 1
+        bits[35] = 1
+        bits[42] = 1
+    elif temp[5] == 2:
+        bits[28] = 1
+        bits[35] = 1
+    elif temp[5] == 1:
+        bits[28] = 1
+    else:
+        pass
+    if temp[6] >= 5:
+        bits[63] = 1
+        bits[70] = 1
+        bits[77] = 1
+        bits[84] = 1
+        bits[91] = 1
+    elif temp[6] == 4:
+        bits[63] = 1
+        bits[70] = 1
+        bits[77] = 1
+        bits[84] = 1
+    elif temp[6] == 3:
+        bits[63] = 1
+        bits[70] = 1
+        bits[77] = 1
+    elif temp[6] == 2:
+        bits[63] = 1
+        bits[70] = 1
+    elif temp[6] == 1:
+        bits[63] = 1
+    else:
+        pass
+    if temp[7] >= 2:
+        bits[98] = 1
+        bits[105] = 1
+    elif temp[7] == 1:
+        bits[98] = 1
+    else:
+        pass
+    if temp[8] >= 2:
+        bits[112] = 1
+        bits[119] = 1
+    elif temp[8] == 1:
+        bits[112] = 1
+    else:
+        pass
+    if temp[9] >= 1:
+        bits[126] = 1
+    else:
+        pass
+    if temp[10] >= 1:
+        bits[133] = 1
+    else:
+        pass
+    return ringSize, bits
+def func_2(mol, bits):
+    """ *Internal Use Only*
+    saturated or aromatic carbon-only ring
+    """
+    AllRingsBond = mol.GetRingInfo().BondRings()
+    ringSize = []
+    temp = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
+    for ring in AllRingsBond:
+        ######### saturated
+        nonsingle = False
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'SINGLE':
+                nonsingle = True
+                break
+        if nonsingle == False:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+        ######## aromatic carbon-only
+        aromatic = True
+        AllCarb = True
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'AROMATIC':
+                aromatic = False
+                break
+        for bondIdx in ring:
+            BeginAtom = mol.GetBondWithIdx(bondIdx).GetBeginAtom()
+            EndAtom = mol.GetBondWithIdx(bondIdx).GetEndAtom()
+            if BeginAtom.GetAtomicNum() != 6 or EndAtom.GetAtomicNum() != 6:
+                AllCarb = False
+                break
+        if aromatic == True and AllCarb == True:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+    if temp[3] >= 2:
+        bits[1] = 1
+        bits[8] = 1
+    elif temp[3] == 1:
+        bits[1] = 1
+    else:
+        pass
+    if temp[4] >= 2:
+        bits[15] = 1
+        bits[22] = 1
+    elif temp[4] == 1:
+        bits[15] = 1
+    else:
+        pass
+    if temp[5] >= 5:
+        bits[29] = 1
+        bits[36] = 1
+        bits[43] = 1
+        bits[50] = 1
+        bits[57] = 1
+    elif temp[5] == 4:
+        bits[29] = 1
+        bits[36] = 1
+        bits[43] = 1
+        bits[50] = 1
+    elif temp[5] == 3:
+        bits[29] = 1
+        bits[36] = 1
+        bits[43] = 1
+    elif temp[5] == 2:
+        bits[29] = 1
+        bits[36] = 1
+    elif temp[5] == 1:
+        bits[29] = 1
+    else:
+        pass
+    if temp[6] >= 5:
+        bits[64] = 1
+        bits[71] = 1
+        bits[78] = 1
+        bits[85] = 1
+        bits[92] = 1
+    elif temp[6] == 4:
+        bits[64] = 1
+        bits[71] = 1
+        bits[78] = 1
+        bits[85] = 1
+    elif temp[6] == 3:
+        bits[64] = 1
+        bits[71] = 1
+        bits[78] = 1
+    elif temp[6] == 2:
+        bits[64] = 1
+        bits[71] = 1
+    elif temp[6] == 1:
+        bits[64] = 1
+    else:
+        pass
+    if temp[7] >= 2:
+        bits[99] = 1
+        bits[106] = 1
+    elif temp[7] == 1:
+        bits[99] = 1
+    else:
+        pass
+    if temp[8] >= 2:
+        bits[113] = 1
+        bits[120] = 1
+    elif temp[8] == 1:
+        bits[113] = 1
+    else:
+        pass
+    if temp[9] >= 1:
+        bits[127] = 1
+    else:
+        pass
+    if temp[10] >= 1:
+        bits[134] = 1
+    else:
+        pass
+    return ringSize, bits
+def func_3(mol, bits):
+    """ *Internal Use Only*
+    saturated or aromatic nitrogen-containing
+    """
+    AllRingsBond = mol.GetRingInfo().BondRings()
+    ringSize = []
+    temp = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
+    for ring in AllRingsBond:
+        ######### saturated
+        nonsingle = False
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'SINGLE':
+                nonsingle = True
+                break
+        if nonsingle == False:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+        ######## aromatic nitrogen-containing
+        aromatic = True
+        ContainNitro = False
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'AROMATIC':
+                aromatic = False
+                break
+        for bondIdx in ring:
+            BeginAtom = mol.GetBondWithIdx(bondIdx).GetBeginAtom()
+            EndAtom = mol.GetBondWithIdx(bondIdx).GetEndAtom()
+            if BeginAtom.GetAtomicNum() == 7 or EndAtom.GetAtomicNum() == 7:
+                ContainNitro = True
+                break
+        if aromatic == True and ContainNitro == True:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+    if temp[3] >= 2:
+        bits[2] = 1
+        bits[9] = 1
+    elif temp[3] == 1:
+        bits[2] = 1
+    else:
+        pass
+    if temp[4] >= 2:
+        bits[16] = 1
+        bits[23] = 1
+    elif temp[4] == 1:
+        bits[16] = 1
+    else:
+        pass
+    if temp[5] >= 5:
+        bits[30] = 1
+        bits[37] = 1
+        bits[44] = 1
+        bits[51] = 1
+        bits[58] = 1
+    elif temp[5] == 4:
+        bits[30] = 1
+        bits[37] = 1
+        bits[44] = 1
+        bits[51] = 1
+    elif temp[5] == 3:
+        bits[30] = 1
+        bits[37] = 1
+        bits[44] = 1
+    elif temp[5] == 2:
+        bits[30] = 1
+        bits[37] = 1
+    elif temp[5] == 1:
+        bits[30] = 1
+    else:
+        pass
+    if temp[6] >= 5:
+        bits[65] = 1
+        bits[72] = 1
+        bits[79] = 1
+        bits[86] = 1
+        bits[93] = 1
+    elif temp[6] == 4:
+        bits[65] = 1
+        bits[72] = 1
+        bits[79] = 1
+        bits[86] = 1
+    elif temp[6] == 3:
+        bits[65] = 1
+        bits[72] = 1
+        bits[79] = 1
+    elif temp[6] == 2:
+        bits[65] = 1
+        bits[72] = 1
+    elif temp[6] == 1:
+        bits[65] = 1
+    else:
+        pass
+    if temp[7] >= 2:
+        bits[100] = 1
+        bits[107] = 1
+    elif temp[7] == 1:
+        bits[100] = 1
+    else:
+        pass
+    if temp[8] >= 2:
+        bits[114] = 1
+        bits[121] = 1
+    elif temp[8] == 1:
+        bits[114] = 1
+    else:
+        pass
+    if temp[9] >= 1:
+        bits[128] = 1
+    else:
+        pass
+    if temp[10] >= 1:
+        bits[135] = 1
+    else:
+        pass
+    return ringSize, bits
+def func_4(mol, bits):
+    """ *Internal Use Only*
+    saturated or aromatic heteroatom-containing
+    """
+    AllRingsBond = mol.GetRingInfo().BondRings()
+    ringSize = []
+    temp = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
+    for ring in AllRingsBond:
+        ######### saturated
+        nonsingle = False
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'SINGLE':
+                nonsingle = True
+                break
+        if nonsingle == False:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+        ######## aromatic heteroatom-containing
+        aromatic = True
+        heteroatom = False
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'AROMATIC':
+                aromatic = False
+                break
+        for bondIdx in ring:
+            BeginAtom = mol.GetBondWithIdx(bondIdx).GetBeginAtom()
+            EndAtom = mol.GetBondWithIdx(bondIdx).GetEndAtom()
+            if BeginAtom.GetAtomicNum() not in [1, 6] or EndAtom.GetAtomicNum() not in [1, 6]:
+                heteroatom = True
+                break
+        if aromatic == True and heteroatom == True:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+    if temp[3] >= 2:
+        bits[3] = 1
+        bits[10] = 1
+    elif temp[3] == 1:
+        bits[3] = 1
+    else:
+        pass
+    if temp[4] >= 2:
+        bits[17] = 1
+        bits[24] = 1
+    elif temp[4] == 1:
+        bits[17] = 1
+    else:
+        pass
+    if temp[5] >= 5:
+        bits[31] = 1
+        bits[38] = 1
+        bits[45] = 1
+        bits[52] = 1
+        bits[59] = 1
+    elif temp[5] == 4:
+        bits[31] = 1
+        bits[38] = 1
+        bits[45] = 1
+        bits[52] = 1
+    elif temp[5] == 3:
+        bits[31] = 1
+        bits[38] = 1
+        bits[45] = 1
+    elif temp[5] == 2:
+        bits[31] = 1
+        bits[38] = 1
+    elif temp[5] == 1:
+        bits[31] = 1
+    else:
+        pass
+    if temp[6] >= 5:
+        bits[66] = 1
+        bits[73] = 1
+        bits[80] = 1
+        bits[87] = 1
+        bits[94] = 1
+    elif temp[6] == 4:
+        bits[66] = 1
+        bits[73] = 1
+        bits[80] = 1
+        bits[87] = 1
+    elif temp[6] == 3:
+        bits[66] = 1
+        bits[73] = 1
+        bits[80] = 1
+    elif temp[6] == 2:
+        bits[66] = 1
+        bits[73] = 1
+    elif temp[6] == 1:
+        bits[66] = 1
+    else:
+        pass
+    if temp[7] >= 2:
+        bits[101] = 1
+        bits[108] = 1
+    elif temp[7] == 1:
+        bits[101] = 1
+    else:
+        pass
+    if temp[8] >= 2:
+        bits[115] = 1
+        bits[122] = 1
+    elif temp[8] == 1:
+        bits[115] = 1
+    else:
+        pass
+    if temp[9] >= 1:
+        bits[129] = 1
+    else:
+        pass
+    if temp[10] >= 1:
+        bits[136] = 1
+    else:
+        pass
+    return ringSize, bits
+def func_5(mol, bits):
+    """ *Internal Use Only*
+    unsaturated non-aromatic carbon-only
+    """
+    ringSize = []
+    AllRingsBond = mol.GetRingInfo().BondRings()
+    temp = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
+    for ring in AllRingsBond:
+        unsaturated = False
+        nonaromatic = True
+        Allcarb = True
+        ######### unsaturated
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'SINGLE':
+                unsaturated = True
+                break
+        ######## non-aromatic
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name == 'AROMATIC':
+                nonaromatic = False
+                break
+        ######## allcarb
+        for bondIdx in ring:
+            BeginAtom = mol.GetBondWithIdx(bondIdx).GetBeginAtom()
+            EndAtom = mol.GetBondWithIdx(bondIdx).GetEndAtom()
+            if BeginAtom.GetAtomicNum() != 6 or EndAtom.GetAtomicNum() != 6:
+                Allcarb = False
+                break
+        if unsaturated == True and nonaromatic == True and Allcarb == True:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+    if temp[3] >= 2:
+        bits[4] = 1
+        bits[11] = 1
+    elif temp[3] == 1:
+        bits[4] = 1
+    else:
+        pass
+    if temp[4] >= 2:
+        bits[18] = 1
+        bits[25] = 1
+    elif temp[4] == 1:
+        bits[18] = 1
+    else:
+        pass
+    if temp[5] >= 5:
+        bits[32] = 1
+        bits[39] = 1
+        bits[46] = 1
+        bits[53] = 1
+        bits[60] = 1
+    elif temp[5] == 4:
+        bits[32] = 1
+        bits[39] = 1
+        bits[46] = 1
+        bits[53] = 1
+    elif temp[5] == 3:
+        bits[32] = 1
+        bits[39] = 1
+        bits[46] = 1
+    elif temp[5] == 2:
+        bits[32] = 1
+        bits[39] = 1
+    elif temp[5] == 1:
+        bits[32] = 1
+    else:
+        pass
+    if temp[6] >= 5:
+        bits[67] = 1
+        bits[74] = 1
+        bits[81] = 1
+        bits[88] = 1
+        bits[95] = 1
+    elif temp[6] == 4:
+        bits[67] = 1
+        bits[74] = 1
+        bits[81] = 1
+        bits[88] = 1
+    elif temp[6] == 3:
+        bits[67] = 1
+        bits[74] = 1
+        bits[81] = 1
+    elif temp[6] == 2:
+        bits[67] = 1
+        bits[74] = 1
+    elif temp[6] == 1:
+        bits[67] = 1
+    else:
+        pass
+    if temp[7] >= 2:
+        bits[102] = 1
+        bits[109] = 1
+    elif temp[7] == 1:
+        bits[102] = 1
+    else:
+        pass
+    if temp[8] >= 2:
+        bits[116] = 1
+        bits[123] = 1
+    elif temp[8] == 1:
+        bits[116] = 1
+    else:
+        pass
+    if temp[9] >= 1:
+        bits[130] = 1
+    else:
+        pass
+    if temp[10] >= 1:
+        bits[137] = 1
+    else:
+        pass
+    return ringSize, bits
+def func_6(mol, bits):
+    """ *Internal Use Only*
+    unsaturated non-aromatic nitrogen-containing
+    """
+    ringSize = []
+    AllRingsBond = mol.GetRingInfo().BondRings()
+    temp = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
+    for ring in AllRingsBond:
+        unsaturated = False
+        nonaromatic = True
+        ContainNitro = False
+        ######### unsaturated
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'SINGLE':
+                unsaturated = True
+                break
+        ######## non-aromatic
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name == 'AROMATIC':
+                nonaromatic = False
+                break
+        ######## nitrogen-containing
+        for bondIdx in ring:
+            BeginAtom = mol.GetBondWithIdx(bondIdx).GetBeginAtom()
+            EndAtom = mol.GetBondWithIdx(bondIdx).GetEndAtom()
+            if BeginAtom.GetAtomicNum() == 7 or EndAtom.GetAtomicNum() == 7:
+                ContainNitro = True
+                break
+        if unsaturated == True and nonaromatic == True and ContainNitro == True:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+    if temp[3] >= 2:
+        bits[5] = 1
+        bits[12] = 1
+    elif temp[3] == 1:
+        bits[5] = 1
+    else:
+        pass
+    if temp[4] >= 2:
+        bits[19] = 1
+        bits[26] = 1
+    elif temp[4] == 1:
+        bits[19] = 1
+    else:
+        pass
+    if temp[5] >= 5:
+        bits[33] = 1
+        bits[40] = 1
+        bits[47] = 1
+        bits[54] = 1
+        bits[61] = 1
+    elif temp[5] == 4:
+        bits[33] = 1
+        bits[40] = 1
+        bits[47] = 1
+        bits[54] = 1
+    elif temp[5] == 3:
+        bits[33] = 1
+        bits[40] = 1
+        bits[47] = 1
+    elif temp[5] == 2:
+        bits[33] = 1
+        bits[40] = 1
+    elif temp[5] == 1:
+        bits[33] = 1
+    else:
+        pass
+    if temp[6] >= 5:
+        bits[68] = 1
+        bits[75] = 1
+        bits[82] = 1
+        bits[89] = 1
+        bits[96] = 1
+    elif temp[6] == 4:
+        bits[68] = 1
+        bits[75] = 1
+        bits[82] = 1
+        bits[89] = 1
+    elif temp[6] == 3:
+        bits[68] = 1
+        bits[75] = 1
+        bits[82] = 1
+    elif temp[6] == 2:
+        bits[68] = 1
+        bits[75] = 1
+    elif temp[6] == 1:
+        bits[68] = 1
+    else:
+        pass
+    if temp[7] >= 2:
+        bits[103] = 1
+        bits[110] = 1
+    elif temp[7] == 1:
+        bits[103] = 1
+    else:
+        pass
+    if temp[8] >= 2:
+        bits[117] = 1
+        bits[124] = 1
+    elif temp[8] == 1:
+        bits[117] = 1
+    else:
+        pass
+    if temp[9] >= 1:
+        bits[131] = 1
+    else:
+        pass
+    if temp[10] >= 1:
+        bits[138] = 1
+    else:
+        pass
+    return ringSize, bits
+def func_7(mol, bits):
+    """ *Internal Use Only*
+    unsaturated non-aromatic heteroatom-containing
+    """
+    ringSize = []
+    AllRingsBond = mol.GetRingInfo().BondRings()
+    temp = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
+    for ring in AllRingsBond:
+        unsaturated = False
+        nonaromatic = True
+        heteroatom = False
+        ######### unsaturated
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'SINGLE':
+                unsaturated = True
+                break
+        ######## non-aromatic
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name == 'AROMATIC':
+                nonaromatic = False
+                break
+        ######## heteroatom-containing
+        for bondIdx in ring:
+            BeginAtom = mol.GetBondWithIdx(bondIdx).GetBeginAtom()
+            EndAtom = mol.GetBondWithIdx(bondIdx).GetEndAtom()
+            if BeginAtom.GetAtomicNum() not in [1, 6] or EndAtom.GetAtomicNum() not in [1, 6]:
+                heteroatom = True
+                break
+        if unsaturated == True and nonaromatic == True and heteroatom == True:
+            ringSize.append(len(ring))
+            for k, v in temp.items():
+                if len(ring) == k:
+                    temp[k] += 1
+    if temp[3] >= 2:
+        bits[6] = 1
+        bits[13] = 1
+    elif temp[3] == 1:
+        bits[6] = 1
+    else:
+        pass
+    if temp[4] >= 2:
+        bits[20] = 1
+        bits[27] = 1
+    elif temp[4] == 1:
+        bits[20] = 1
+    else:
+        pass
+    if temp[5] >= 5:
+        bits[34] = 1
+        bits[41] = 1
+        bits[48] = 1
+        bits[55] = 1
+        bits[62] = 1
+    elif temp[5] == 4:
+        bits[34] = 1
+        bits[41] = 1
+        bits[48] = 1
+        bits[55] = 1
+    elif temp[5] == 3:
+        bits[34] = 1
+        bits[41] = 1
+        bits[48] = 1
+    elif temp[5] == 2:
+        bits[34] = 1
+        bits[41] = 1
+    elif temp[5] == 1:
+        bits[34] = 1
+    else:
+        pass
+    if temp[6] >= 5:
+        bits[69] = 1
+        bits[76] = 1
+        bits[83] = 1
+        bits[90] = 1
+        bits[97] = 1
+    elif temp[6] == 4:
+        bits[69] = 1
+        bits[76] = 1
+        bits[83] = 1
+        bits[90] = 1
+    elif temp[6] == 3:
+        bits[69] = 1
+        bits[76] = 1
+        bits[83] = 1
+    elif temp[6] == 2:
+        bits[69] = 1
+        bits[76] = 1
+    elif temp[6] == 1:
+        bits[69] = 1
+    else:
+        pass
+    if temp[7] >= 2:
+        bits[104] = 1
+        bits[111] = 1
+    elif temp[7] == 1:
+        bits[104] = 1
+    else:
+        pass
+    if temp[8] >= 2:
+        bits[118] = 1
+        bits[125] = 1
+    elif temp[8] == 1:
+        bits[118] = 1
+    else:
+        pass
+    if temp[9] >= 1:
+        bits[132] = 1
+    else:
+        pass
+    if temp[10] >= 1:
+        bits[139] = 1
+    else:
+        pass
+    return ringSize, bits
+def func_8(mol, bits):
+    """ *Internal Use Only*
+    aromatic rings or hetero-aromatic rings
+    """
+    AllRingsBond = mol.GetRingInfo().BondRings()
+    temp = {'aromatic': 0, 'heteroatom': 0}
+    for ring in AllRingsBond:
+        aromatic = True
+        heteroatom = False
+        for bondIdx in ring:
+            if mol.GetBondWithIdx(bondIdx).GetBondType().name != 'AROMATIC':
+                aromatic = False
+                break
+        if aromatic == True:
+            temp['aromatic'] += 1
+        for bondIdx in ring:
+            BeginAtom = mol.GetBondWithIdx(bondIdx).GetBeginAtom()
+            EndAtom = mol.GetBondWithIdx(bondIdx).GetEndAtom()
+            if BeginAtom.GetAtomicNum() not in [1, 6] or EndAtom.GetAtomicNum() not in [1, 6]:
+                heteroatom = True
+                break
+        if heteroatom == True:
+            temp['heteroatom'] += 1
+    if temp['aromatic'] >= 4:
+        bits[140] = 1
+        bits[142] = 1
+        bits[144] = 1
+        bits[146] = 1
+    elif temp['aromatic'] == 3:
+        bits[140] = 1
+        bits[142] = 1
+        bits[144] = 1
+    elif temp['aromatic'] == 2:
+        bits[140] = 1
+        bits[142] = 1
+    elif temp['aromatic'] == 1:
+        bits[140] = 1
+    else:
+        pass
+    if temp['aromatic'] >= 4 and temp['heteroatom'] >= 4:
+        bits[141] = 1
+        bits[143] = 1
+        bits[145] = 1
+        bits[147] = 1
+    elif temp['aromatic'] == 3 and temp['heteroatom'] == 3:
+        bits[141] = 1
+        bits[143] = 1
+        bits[145] = 1
+    elif temp['aromatic'] == 2 and temp['heteroatom'] == 2:
+        bits[141] = 1
+        bits[143] = 1
+    elif temp['aromatic'] == 1 and temp['heteroatom'] == 1:
+        bits[141] = 1
+    else:
+        pass
+    return bits
+def calcPubChemFingerPart2(mol):  # 116-263
+    """ *Internal Use Only*
+    Calculate PubChem Fingerprints （116-263)
+    """
+    bits = [0] * 148
+    bits = func_1(mol, bits)[1]
+    bits = func_2(mol, bits)[1]
+    bits = func_3(mol, bits)[1]
+    bits = func_4(mol, bits)[1]
+    bits = func_5(mol, bits)[1]
+    bits = func_6(mol, bits)[1]
+    bits = func_7(mol, bits)[1]
+    bits = func_8(mol, bits)
+    return bits
+def GetPubChemFPs(mol):
+    """*Internal Use Only*
+    Calculate PubChem Fingerprints
+    """
+    mol = Chem.AddHs(mol)
+    AllBits = [0] * 881
+    res1 = list(calcPubChemFingerPart1(mol).ToBitString())
+    for index, item in enumerate(res1[1:116]):
+        if item == '1':
+            AllBits[index] = 1
+    for index2, item2 in enumerate(res1[116:734]):
+        if item2 == '1':
+            AllBits[index2 + 115 + 148] = 1
+    res2 = calcPubChemFingerPart2(mol)
+    for index3, item3 in enumerate(res2):
+        if item3 == 1:
+            AllBits[index3 + 115] = 1
+    AllBits = np.array(AllBits, dtype=np.bool_)
+    return AllBits
+# ------------------------------------
+file_path = os.path.dirname(__file__)
+def GetPubChemFPInfos():
+    return pd.read_excel(os.path.join(file_path, 'pubchemfp.xlsx'))
+if __name__ == '__main__':
+    print('-' * 10 + 'START' + '-' * 10)
+    SMILES = 'C1=NC2NC3=CNCC3=CC2CC1'
+    mol = Chem.MolFromSmiles(SMILES)
+    mol2 = Chem.AddHs(mol)
+    result = GetPubChemFPs(mol2)
+    print('Molecule: %s' % SMILES)
+    print('-' * 25)
+    print('Results: %s' % result)
+    print('-' * 10 + 'END' + '-' * 10)

deepscreen/data/featurizers/fingerprint/pubchemfp.xlsx ADDED Viewed

Binary file (41.2 kB). View file

deepscreen/data/featurizers/fingerprint/rdkitfp.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""
+topological fingerprint
+"""
+import numpy as np
+from rdkit.Chem.rdmolops import RDKFingerprint
+from rdkit.Chem import DataStructs
+_type = 'topological-based'
+def GetRDkitFPs(mol, nBits=2048, return_bitInfo=False):
+    """
+    #################################################################
+    Calculate Daylight-like fingerprint or topological fingerprint
+    (1024 bits).
+    Usage:
+        result=CalculateDaylightFingerprint(mol)
+        Input: mol is a molecule object.
+        Output: result is a tuple form. The first is the number of
+        fingerprints. The second is a dict form whose keys are the
+        position which this molecule has some substructure. The third
+        is the DataStructs which is used for calculating the similarity.
+    #################################################################
+    """
+    bitInfo = {}
+    fp = RDKFingerprint(mol, fpSize=nBits, bitInfo=bitInfo)
+    arr = np.zeros((0,), dtype=np.bool_)
+    DataStructs.ConvertToNumpyArray(fp, arr)
+    if return_bitInfo:
+        return arr, return_bitInfo
+    return arr

deepscreen/data/featurizers/fingerprint/smarts_maccskey.py ADDED Viewed

	@@ -0,0 +1,178 @@

+# MACCS key lookup table: maps 'MACCSFP<n>' (n = 0..166) to a
+# (SMARTS pattern, integer) tuple.
+# NOTE(review): the integer looks like a match-count threshold, as in RDKit's
+# MACCSkeys table — confirm against the code that consumes this dict.
+# NOTE(review): entries whose pattern is None or '?' carry no usable SMARTS and
+# presumably have to be special-cased by the consumer — verify.
+smartsPatts = {
+    'MACCSFP0': (None, 0),
+    # ignore, Bit 0 is a placeholder and should be ignored: https://github.com/rdkit/rdkit/issues/1726
+    'MACCSFP1': ('?', 0),
+    'MACCSFP2': ('[#104]', 0),
+    'MACCSFP3': ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),
+    'MACCSFP4': ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
+    'MACCSFP5': ('[Sc,Ti,Y,Zr,Hf]', 0),
+    'MACCSFP6': ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
+    'MACCSFP7': ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
+    'MACCSFP8': ('[!#6;!#1]1~*~*~*~1', 0),
+    'MACCSFP9': ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
+    'MACCSFP10': ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
+    'MACCSFP11': ('*1~*~*~*~1', 0),
+    'MACCSFP12': ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
+    'MACCSFP13': ('[#8]~[#7](~[#6])~[#6]', 0),
+    'MACCSFP14': ('[#16]-[#16]', 0),
+    'MACCSFP15': ('[#8]~[#6](~[#8])~[#8]', 0),
+    'MACCSFP16': ('[!#6;!#1]1~*~*~1', 0),
+    'MACCSFP17': ('[#6]#[#6]', 0),
+    'MACCSFP18': ('[#5,#13,#31,#49,#81]', 0),
+    'MACCSFP19': ('*1~*~*~*~*~*~*~1', 0),
+    'MACCSFP20': ('[#14]', 0),
+    'MACCSFP21': ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),
+    'MACCSFP22': ('*1~*~*~1', 0),
+    'MACCSFP23': ('[#7]~[#6](~[#8])~[#8]', 0),
+    'MACCSFP24': ('[#7]-[#8]', 0),
+    'MACCSFP25': ('[#7]~[#6](~[#7])~[#7]', 0),
+    'MACCSFP26': ('[#6]=;@[#6](@*)@*', 0),
+    'MACCSFP27': ('[I]', 0),
+    'MACCSFP28': ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),
+    'MACCSFP29': ('[#15]', 0),
+    'MACCSFP30': ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),
+    'MACCSFP31': ('[!#6;!#1]~[F,Cl,Br,I]', 0),
+    'MACCSFP32': ('[#6]~[#16]~[#7]', 0),
+    'MACCSFP33': ('[#7]~[#16]', 0),
+    'MACCSFP34': ('[CH2]=*', 0),
+    'MACCSFP35': ('[Li,Na,K,Rb,Cs,Fr]', 0),
+    'MACCSFP36': ('[#16R]', 0),
+    'MACCSFP37': ('[#7]~[#6](~[#8])~[#7]', 0),
+    'MACCSFP38': ('[#7]~[#6](~[#6])~[#7]', 0),
+    'MACCSFP39': ('[#8]~[#16](~[#8])~[#8]', 0),
+    'MACCSFP40': ('[#16]-[#8]', 0),
+    'MACCSFP41': ('[#6]#[#7]', 0),
+    'MACCSFP42': ('F', 0),
+    'MACCSFP43': ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),
+    'MACCSFP44': ('?', 0),
+    'MACCSFP45': ('[#6]=[#6]~[#7]', 0),
+    'MACCSFP46': ('Br', 0),
+    'MACCSFP47': ('[#16]~*~[#7]', 0),
+    'MACCSFP48': ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),
+    'MACCSFP49': ('[!+0]', 0),
+    'MACCSFP50': ('[#6]=[#6](~[#6])~[#6]', 0),
+    'MACCSFP51': ('[#6]~[#16]~[#8]', 0),
+    'MACCSFP52': ('[#7]~[#7]', 0),
+    'MACCSFP53': ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),
+    'MACCSFP54': ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),
+    'MACCSFP55': ('[#8]~[#16]~[#8]', 0),
+    'MACCSFP56': ('[#8]~[#7](~[#8])~[#6]', 0),
+    'MACCSFP57': ('[#8R]', 0),
+    'MACCSFP58': ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),
+    'MACCSFP59': ('[#16]!:*:*', 0),
+    'MACCSFP60': ('[#16]=[#8]', 0),
+    'MACCSFP61': ('*~[#16](~*)~*', 0),
+    'MACCSFP62': ('*@*!@*@*', 0),
+    'MACCSFP63': ('[#7]=[#8]', 0),
+    'MACCSFP64': ('*@*!@[#16]', 0),
+    'MACCSFP65': ('c:n', 0),
+    'MACCSFP66': ('[#6]~[#6](~[#6])(~[#6])~*', 0),
+    'MACCSFP67': ('[!#6;!#1]~[#16]', 0),
+    'MACCSFP68': ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),
+    'MACCSFP69': ('[!#6;!#1]~[!#6;!#1;!H0]', 0),
+    'MACCSFP70': ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),
+    'MACCSFP71': ('[#7]~[#8]', 0),
+    'MACCSFP72': ('[#8]~*~*~[#8]', 0),
+    'MACCSFP73': ('[#16]=*', 0),
+    'MACCSFP74': ('[CH3]~*~[CH3]', 0),
+    'MACCSFP75': ('*!@[#7]@*', 0),
+    'MACCSFP76': ('[#6]=[#6](~*)~*', 0),
+    'MACCSFP77': ('[#7]~*~[#7]', 0),
+    'MACCSFP78': ('[#6]=[#7]', 0),
+    'MACCSFP79': ('[#7]~*~*~[#7]', 0),
+    'MACCSFP80': ('[#7]~*~*~*~[#7]', 0),
+    'MACCSFP81': ('[#16]~*(~*)~*', 0),
+    'MACCSFP82': ('*~[CH2]~[!#6;!#1;!H0]', 0),
+    'MACCSFP83': ('[!#6;!#1]1~*~*~*~*~1', 0),
+    'MACCSFP84': ('[NH2]', 0),
+    'MACCSFP85': ('[#6]~[#7](~[#6])~[#6]', 0),
+    'MACCSFP86': ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),
+    'MACCSFP87': ('[F,Cl,Br,I]!@*@*', 0),
+    'MACCSFP88': ('[#16]', 0),
+    'MACCSFP89': ('[#8]~*~*~*~[#8]', 0),
+    'MACCSFP90': (
+    '[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',
+    0),
+    'MACCSFP91': (
+    '[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',
+    0),
+    'MACCSFP92': ('[#8]~[#6](~[#7])~[#6]', 0),
+    'MACCSFP93': ('[!#6;!#1]~[CH3]', 0),
+    'MACCSFP94': ('[!#6;!#1]~[#7]', 0),
+    'MACCSFP95': ('[#7]~*~*~[#8]', 0),
+    'MACCSFP96': ('*1~*~*~*~*~1', 0),
+    'MACCSFP97': ('[#7]~*~*~*~[#8]', 0),
+    'MACCSFP98': ('[!#6;!#1]1~*~*~*~*~*~1', 0),
+    'MACCSFP99': ('[#6]=[#6]', 0),
+    'MACCSFP100': ('*~[CH2]~[#7]', 0),
+    'MACCSFP101': (
+    '[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',
+    0),
+    'MACCSFP102': ('[!#6;!#1]~[#8]', 0),
+    'MACCSFP103': ('Cl', 0),
+    'MACCSFP104': ('[!#6;!#1;!H0]~*~[CH2]~*', 0),
+    'MACCSFP105': ('*@*(@*)@*', 0),
+    'MACCSFP106': ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),
+    'MACCSFP107': ('[F,Cl,Br,I]~*(~*)~*', 0),
+    'MACCSFP108': ('[CH3]~*~*~*~[CH2]~*', 0),
+    'MACCSFP109': ('*~[CH2]~[#8]', 0),
+    'MACCSFP110': ('[#7]~[#6]~[#8]', 0),
+    'MACCSFP111': ('[#7]~*~[CH2]~*', 0),
+    'MACCSFP112': ('*~*(~*)(~*)~*', 0),
+    'MACCSFP113': ('[#8]!:*:*', 0),
+    'MACCSFP114': ('[CH3]~[CH2]~*', 0),
+    'MACCSFP115': ('[CH3]~*~[CH2]~*', 0),
+    'MACCSFP116': ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),
+    'MACCSFP117': ('[#7]~*~[#8]', 0),
+    'MACCSFP118': ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),
+    'MACCSFP119': ('[#7]=*', 0),
+    'MACCSFP120': ('[!#6;R]', 1),
+    'MACCSFP121': ('[#7;R]', 0),
+    'MACCSFP122': ('*~[#7](~*)~*', 0),
+    'MACCSFP123': ('[#8]~[#6]~[#8]', 0),
+    'MACCSFP124': ('[!#6;!#1]~[!#6;!#1]', 0),
+    'MACCSFP125': ('?', 0),
+    'MACCSFP126': ('*!@[#8]!@*', 0),
+    'MACCSFP127': ('*@*!@[#8]', 1),
+    'MACCSFP128': (
+    '[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',
+    0),
+    'MACCSFP129': ('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',
+                   0),
+    'MACCSFP130': ('[!#6;!#1]~[!#6;!#1]', 1),
+    'MACCSFP131': ('[!#6;!#1;!H0]', 1),
+    'MACCSFP132': ('[#8]~*~[CH2]~*', 0),
+    'MACCSFP133': ('*@*!@[#7]', 0),
+    'MACCSFP134': ('[F,Cl,Br,I]', 0),
+    'MACCSFP135': ('[#7]!:*:*', 0),
+    'MACCSFP136': ('[#8]=*', 1),
+    'MACCSFP137': ('[!C;!c;R]', 0),
+    'MACCSFP138': ('[!#6;!#1]~[CH2]~*', 1),
+    'MACCSFP139': ('[O;!H0]', 0),
+    'MACCSFP140': ('[#8]', 3),
+    'MACCSFP141': ('[CH3]', 2),
+    'MACCSFP142': ('[#7]', 1),
+    'MACCSFP143': ('*@*!@[#8]', 0),
+    'MACCSFP144': ('*!:*:*!:*', 0),
+    'MACCSFP145': ('*1~*~*~*~*~*~1', 1),
+    'MACCSFP146': ('[#8]', 2),
+    'MACCSFP147': ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),
+    'MACCSFP148': ('*~[!#6;!#1](~*)~*', 0),
+    'MACCSFP149': ('[C;H3,H4]', 1),
+    'MACCSFP150': ('*!@*@*!@*', 0),
+    'MACCSFP151': ('[#7;!H0]', 0),
+    'MACCSFP152': ('[#8]~[#6](~[#6])~[#6]', 0),
+    'MACCSFP153': ('[!#6;!#1]~[CH2]~*', 0),
+    'MACCSFP154': ('[#6]=[#8]', 0),
+    'MACCSFP155': ('*!@[CH2]!@*', 0),
+    'MACCSFP156': ('[#7]~*(~*)~*', 0),
+    'MACCSFP157': ('[#6]-[#8]', 0),
+    'MACCSFP158': ('[#6]-[#7]', 0),
+    'MACCSFP159': ('[#8]', 1),
+    'MACCSFP160': ('[C;H3,H4]', 0),
+    'MACCSFP161': ('[#7]', 0),
+    'MACCSFP162': ('a', 0),
+    'MACCSFP163': ('*1~*~*~*~*~*~1', 0),
+    'MACCSFP164': ('[#8]', 0),
+    'MACCSFP165': ('[R]', 0),
+    'MACCSFP166': ('?', 0)}

deepscreen/data/featurizers/fingerprint/smarts_pharmacophore.py ADDED Viewed

	@@ -0,0 +1,21 @@

+Donor = ["[N;!H0;v3,v4&+1]", "[O,S;H1;+0]", "[n&H1&+0]"]
+Acceptor = ["[O,S;H1;v2;!$(*-*=[O,N,P,S])]", "[O;H0;v2]", "[O,S;v1;-]",
+            "[N;v3;!$(N-*=[O,N,P,S])]", "[n&H0&+0]", "[o;+0;!$([o]:n);!$([o]:c:n)]"]
+Positive = ["[#7;+]", "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
+            "[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
+            "[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]"]
+Negative = ["[C,S](=[O,S,P])-[O;H1,-1]"]
+Hydrophobic = ["[C;D3,D4](-[CH3])-[CH3]", "[S;D2](-C)-C"]
+Aromatic = ["a"]
+pharmacophore_smarts = {"Donor": Donor,
+                        "Acceptor": Acceptor,
+                        "Positive": Positive,
+                        "Negative": Negative,
+                        "Hydrophobic": Hydrophobic,
+                        "Aromatic": Aromatic}

deepscreen/data/featurizers/fingerprint/smarts_pubchem.py ADDED Viewed

	@@ -0,0 +1,734 @@

+smartsPatts = {
+    'PubChemFP0': ('[H]', 3),
+    'PubChemFP1': ('[H]', 7),
+    'PubChemFP2': ('[H]', 15),
+    'PubChemFP3': ('[H]', 31),
+    'PubChemFP4': ('[Li]', 0),
+    'PubChemFP5': ('[Li]', 1),
+    'PubChemFP6': ('[B]', 0),
+    'PubChemFP7': ('[B]', 1),
+    'PubChemFP8': ('[B]', 3),
+    'PubChemFP9': ('[C]', 1),
+    'PubChemFP10': ('[C]', 3),
+    'PubChemFP11': ('[C]', 7),
+    'PubChemFP12': ('[C]', 15),
+    'PubChemFP13': ('[C]', 31),
+    'PubChemFP14': ('[N]', 0),
+    'PubChemFP15': ('[N]', 1),
+    'PubChemFP16': ('[N]', 3),
+    'PubChemFP17': ('[N]', 7),
+    'PubChemFP18': ('[O]', 0),
+    'PubChemFP19': ('[O]', 1),
+    'PubChemFP20': ('[O]', 3),
+    'PubChemFP21': ('[O]', 7),
+    'PubChemFP22': ('[O]', 15),
+    'PubChemFP23': ('[F]', 0),
+    'PubChemFP24': ('[F]', 1),
+    'PubChemFP25': ('[F]', 3),
+    'PubChemFP26': ('[Na]', 0),
+    'PubChemFP27': ('[Na]', 1),
+    'PubChemFP28': ('[Si]', 0),
+    'PubChemFP29': ('[Si]', 1),
+    'PubChemFP30': ('[P]', 0),
+    'PubChemFP31': ('[P]', 1),
+    'PubChemFP32': ('[P]', 3),
+    'PubChemFP33': ('[S]', 0),
+    'PubChemFP34': ('[S]', 1),
+    'PubChemFP35': ('[S]', 3),
+    'PubChemFP36': ('[S]', 7),
+    'PubChemFP37': ('[Cl]', 0),
+    'PubChemFP38': ('[Cl]', 1),
+    'PubChemFP39': ('[Cl]', 3),
+    'PubChemFP40': ('[Cl]', 7),
+    'PubChemFP41': ('[K]', 0),
+    'PubChemFP42': ('[K]', 1),
+    'PubChemFP43': ('[Br]', 0),
+    'PubChemFP44': ('[Br]', 1),
+    'PubChemFP45': ('[Br]', 3),
+    'PubChemFP46': ('[I]', 0),
+    'PubChemFP47': ('[I]', 1),
+    'PubChemFP48': ('[I]', 3),
+    'PubChemFP49': ('[Be]', 0),
+    'PubChemFP50': ('[Mg]', 0),
+    'PubChemFP51': ('[Al]', 0),
+    'PubChemFP52': ('[Ca]', 0),
+    'PubChemFP53': ('[Sc]', 0),
+    'PubChemFP54': ('[Ti]', 0),
+    'PubChemFP55': ('[V]', 0),
+    'PubChemFP56': ('[Cr]', 0),
+    'PubChemFP57': ('[Mn]', 0),
+    'PubChemFP58': ('[Fe]', 0),
+    'PubChemFP59': ('[CO]', 0),
+    'PubChemFP60': ('[Ni]', 0),
+    'PubChemFP61': ('[Cu]', 0),
+    'PubChemFP62': ('[Zn]', 0),
+    'PubChemFP63': ('[Ga]', 0),
+    'PubChemFP64': ('[Ge]', 0),
+    'PubChemFP65': ('[As]', 0),
+    'PubChemFP66': ('[Se]', 0),
+    'PubChemFP67': ('[Kr]', 0),
+    'PubChemFP68': ('[Rb]', 0),
+    'PubChemFP69': ('[Sr]', 0),
+    'PubChemFP70': ('[Y]', 0),
+    'PubChemFP71': ('[Zr]', 0),
+    'PubChemFP72': ('[Nb]', 0),
+    'PubChemFP73': ('[Mo]', 0),
+    'PubChemFP74': ('[Ru]', 0),
+    'PubChemFP75': ('[Rh]', 0),
+    'PubChemFP76': ('[Pd]', 0),
+    'PubChemFP77': ('[Ag]', 0),
+    'PubChemFP78': ('[Cd]', 0),
+    'PubChemFP79': ('[In]', 0),
+    'PubChemFP80': ('[Sn]', 0),
+    'PubChemFP81': ('[Sb]', 0),
+    'PubChemFP82': ('[Te]', 0),
+    'PubChemFP83': ('[Xe]', 0),
+    'PubChemFP84': ('[Cs]', 0),
+    'PubChemFP85': ('[Ba]', 0),
+    'PubChemFP86': ('[Lu]', 0),
+    'PubChemFP87': ('[Hf]', 0),
+    'PubChemFP88': ('[Ta]', 0),
+    'PubChemFP89': ('[W]', 0),
+    'PubChemFP90': ('[Re]', 0),
+    'PubChemFP91': ('[Os]', 0),
+    'PubChemFP92': ('[Ir]', 0),
+    'PubChemFP93': ('[Pt]', 0),
+    'PubChemFP94': ('[Au]', 0),
+    'PubChemFP95': ('[Hg]', 0),
+    'PubChemFP96': ('[Tl]', 0),
+    'PubChemFP97': ('[Pb]', 0),
+    'PubChemFP98': ('[Bi]', 0),
+    'PubChemFP99': ('[La]', 0),
+    'PubChemFP100': ('[Ce]', 0),
+    'PubChemFP101': ('[Pr]', 0),
+    'PubChemFP102': ('[Nd]', 0),
+    'PubChemFP103': ('[Pm]', 0),
+    'PubChemFP104': ('[Sm]', 0),
+    'PubChemFP105': ('[Eu]', 0),
+    'PubChemFP106': ('[Gd]', 0),
+    'PubChemFP107': ('[Tb]', 0),
+    'PubChemFP108': ('[Dy]', 0),
+    'PubChemFP109': ('[Ho]', 0),
+    'PubChemFP110': ('[Er]', 0),
+    'PubChemFP111': ('[Tm]', 0),
+    'PubChemFP112': ('[Yb]', 0),
+    'PubChemFP113': ('[Tc]', 0),
+    'PubChemFP114': ('[U]', 0),
+    'PubChemFP263': ('[Li&!H0]', 0),
+    'PubChemFP264': ('[Li]~[Li]', 0),
+    'PubChemFP265': ('[Li]~[#5]', 0),
+    'PubChemFP266': ('[Li]~[#6]', 0),
+    'PubChemFP267': ('[Li]~[#8]', 0),
+    'PubChemFP268': ('[Li]~[F]', 0),
+    'PubChemFP269': ('[Li]~[#15]', 0),
+    'PubChemFP270': ('[Li]~[#16]', 0),
+    'PubChemFP271': ('[Li]~[Cl]', 0),
+    'PubChemFP272': ('[#5&!H0]', 0),
+    'PubChemFP273': ('[#5]~[#5]', 0),
+    'PubChemFP274': ('[#5]~[#6]', 0),
+    'PubChemFP275': ('[#5]~[#7]', 0),
+    'PubChemFP276': ('[#5]~[#8]', 0),
+    'PubChemFP277': ('[#5]~[F]', 0),
+    'PubChemFP278': ('[#5]~[#14]', 0),
+    'PubChemFP279': ('[#5]~[#15]', 0),
+    'PubChemFP280': ('[#5]~[#16]', 0),
+    'PubChemFP281': ('[#5]~[Cl]', 0),
+    'PubChemFP282': ('[#5]~[Br]', 0),
+    'PubChemFP283': ('[#6&!H0]', 0),
+    'PubChemFP284': ('[#6]~[#6]', 0),
+    'PubChemFP285': ('[#6]~[#7]', 0),
+    'PubChemFP286': ('[#6]~[#8]', 0),
+    'PubChemFP287': ('[#6]~[F]', 0),
+    'PubChemFP288': ('[#6]~[Na]', 0),
+    'PubChemFP289': ('[#6]~[Mg]', 0),
+    'PubChemFP290': ('[#6]~[Al]', 0),
+    'PubChemFP291': ('[#6]~[#14]', 0),
+    'PubChemFP292': ('[#6]~[#15]', 0),
+    'PubChemFP293': ('[#6]~[#16]', 0),
+    'PubChemFP294': ('[#6]~[Cl]', 0),
+    'PubChemFP295': ('[#6]~[#33]', 0),
+    'PubChemFP296': ('[#6]~[#34]', 0),
+    'PubChemFP297': ('[#6]~[Br]', 0),
+    'PubChemFP298': ('[#6]~[I]', 0),
+    'PubChemFP299': ('[#7&!H0]', 0),
+    'PubChemFP300': ('[#7]~[#7]', 0),
+    'PubChemFP301': ('[#7]~[#8]', 0),
+    'PubChemFP302': ('[#7]~[F]', 0),
+    'PubChemFP303': ('[#7]~[#14]', 0),
+    'PubChemFP304': ('[#7]~[#15]', 0),
+    'PubChemFP305': ('[#7]~[#16]', 0),
+    'PubChemFP306': ('[#7]~[Cl]', 0),
+    'PubChemFP307': ('[#7]~[Br]', 0),
+    'PubChemFP308': ('[#8&!H0]', 0),
+    'PubChemFP309': ('[#8]~[#8]', 0),
+    'PubChemFP310': ('[#8]~[Mg]', 0),
+    'PubChemFP311': ('[#8]~[Na]', 0),
+    'PubChemFP312': ('[#8]~[Al]', 0),
+    'PubChemFP313': ('[#8]~[#14]', 0),
+    'PubChemFP314': ('[#8]~[#15]', 0),
+    'PubChemFP315': ('[#8]~[K]', 0),
+    'PubChemFP316': ('[F]~[#15]', 0),
+    'PubChemFP317': ('[F]~[#16]', 0),
+    'PubChemFP318': ('[Al&!H0]', 0),
+    'PubChemFP319': ('[Al]~[Cl]', 0),
+    'PubChemFP320': ('[#14&!H0]', 0),
+    'PubChemFP321': ('[#14]~[#14]', 0),
+    'PubChemFP322': ('[#14]~[Cl]', 0),
+    'PubChemFP323': ('[#15&!H0]', 0),
+    'PubChemFP324': ('[#15]~[#15]', 0),
+    'PubChemFP325': ('[#33&!H0]', 0),
+    'PubChemFP326': ('[#33]~[#33]', 0),
+    'PubChemFP327': ('[#6](~Br)(~[#6])', 0),
+    'PubChemFP328': ('[#6](~Br)(~[#6])(~[#6])', 0),
+    'PubChemFP329': ('[#6&!H0]~[Br]', 0),
+    'PubChemFP330': ('[#6](~[Br])(:[c])', 0),
+    'PubChemFP331': ('[#6](~[Br])(:[n])', 0),
+    'PubChemFP332': ('[#6](~[#6])(~[#6])', 0),
+    'PubChemFP333': ('[#6](~[#6])(~[#6])(~[#6])', 0),
+    'PubChemFP334': ('[#6](~[#6])(~[#6])(~[#6])(~[#6])', 0),
+    'PubChemFP335': ('[#6H1](~[#6])(~[#6])(~[#6])', 0),
+    'PubChemFP336': ('[#6](~[#6])(~[#6])(~[#6])(~[#7])', 0),
+    'PubChemFP337': ('[#6](~[#6])(~[#6])(~[#6])(~[#8])', 0),
+    'PubChemFP338': ('[#6H1](~[#6])(~[#6])(~[#7])', 0),
+    'PubChemFP339': ('[#6H1](~[#6])(~[#6])(~[#8])', 0),
+    'PubChemFP340': ('[#6](~[#6])(~[#6])(~[#7])', 0),
+    'PubChemFP341': ('[#6](~[#6])(~[#6])(~[#8])', 0),
+    'PubChemFP342': ('[#6](~[#6])(~[Cl])', 0),
+    'PubChemFP343': ('[#6&!H0](~[#6])(~[Cl])', 0),
+    'PubChemFP344': ('[#6H,#6H2,#6H3,#6H4]~[#6]', 0),
+    'PubChemFP345': ('[#6&!H0](~[#6])(~[#7])', 0),
+    'PubChemFP346': ('[#6&!H0](~[#6])(~[#8])', 0),
+    'PubChemFP347': ('[#6H1](~[#6])(~[#8])(~[#8])', 0),
+    'PubChemFP348': ('[#6&!H0](~[#6])(~[#15])', 0),
+    'PubChemFP349': ('[#6&!H0](~[#6])(~[#16])', 0),
+    'PubChemFP350': ('[#6](~[#6])(~[I])', 0),
+    'PubChemFP351': ('[#6](~[#6])(~[#7])', 0),
+    'PubChemFP352': ('[#6](~[#6])(~[#8])', 0),
+    'PubChemFP353': ('[#6](~[#6])(~[#16])', 0),
+    'PubChemFP354': ('[#6](~[#6])(~[#14])', 0),
+    'PubChemFP355': ('[#6](~[#6])(:c)', 0),
+    'PubChemFP356': ('[#6](~[#6])(:c)(:c)', 0),
+    'PubChemFP357': ('[#6](~[#6])(:c)(:n)', 0),
+    'PubChemFP358': ('[#6](~[#6])(:n)', 0),
+    'PubChemFP359': ('[#6](~[#6])(:n)(:n)', 0),
+    'PubChemFP360': ('[#6](~[Cl])(~[Cl])', 0),
+    'PubChemFP361': ('[#6&!H0](~[Cl])', 0),
+    'PubChemFP362': ('[#6](~[Cl])(:c)', 0),
+    'PubChemFP363': ('[#6](~[F])(~[F])', 0),
+    'PubChemFP364': ('[#6](~[F])(:c)', 0),
+    'PubChemFP365': ('[#6&!H0](~[#7])', 0),
+    'PubChemFP366': ('[#6&!H0](~[#8])', 0),
+    'PubChemFP367': ('[#6&!H0](~[#8])(~[#8])', 0),
+    'PubChemFP368': ('[#6&!H0](~[#16])', 0),
+    'PubChemFP369': ('[#6&!H0](~[#14])', 0),
+    'PubChemFP370': ('[#6&!H0]:c', 0),
+    'PubChemFP371': ('[#6&!H0](:c)(:c)', 0),
+    'PubChemFP372': ('[#6&!H0](:c)(:n)', 0),
+    'PubChemFP373': ('[#6&!H0](:n)', 0),
+    'PubChemFP374': ('[#6H3]', 0),
+    'PubChemFP375': ('[#6](~[#7])(~[#7])', 0),
+    'PubChemFP376': ('[#6](~[#7])(:c)', 0),
+    'PubChemFP377': ('[#6](~[#7])(:c)(:c)', 0),
+    'PubChemFP378': ('[#6](~[#7])(:c)(:n)', 0),
+    'PubChemFP379': ('[#6](~[#7])(:n)', 0),
+    'PubChemFP380': ('[#6](~[#8])(~[#8])', 0),
+    'PubChemFP381': ('[#6](~[#8])(:c)', 0),
+    'PubChemFP382': ('[#6](~[#8])(:c)(:c)', 0),
+    'PubChemFP383': ('[#6](~[#16])(:c)', 0),
+    'PubChemFP384': ('[#6](:c)(:c)', 0),
+    'PubChemFP385': ('[#6](:c)(:c)(:c)', 0),
+    'PubChemFP386': ('[#6](:c)(:c)(:n)', 0),
+    'PubChemFP387': ('[#6](:c)(:n)', 0),
+    'PubChemFP388': ('[#6](:c)(:n)(:n)', 0),
+    'PubChemFP389': ('[#6](:n)(:n)', 0),
+    'PubChemFP390': ('[#7](~[#6])(~[#6])', 0),
+    'PubChemFP391': ('[#7](~[#6])(~[#6])(~[#6])', 0),
+    'PubChemFP392': ('[#7&!H0](~[#6])(~[#6])', 0),
+    'PubChemFP393': ('[#7&!H0](~[#6])', 0),
+    'PubChemFP394': ('[#7&!H0](~[#6])(~[#7])', 0),
+    'PubChemFP395': ('[#7](~[#6])(~[#8])', 0),
+    'PubChemFP396': ('[#7](~[#6])(:c)', 0),
+    'PubChemFP397': ('[#7](~[#6])(:c)(:c)', 0),
+    'PubChemFP398': ('[#7&!H0](~[#7])', 0),
+    'PubChemFP399': ('[#7&!H0](:c)', 0),
+    'PubChemFP400': ('[#7&!H0](:c)(:c)', 0),
+    'PubChemFP401': ('[#7](~[#8])(~[#8])', 0),
+    'PubChemFP402': ('[#7](~[#8])(:o)', 0),
+    'PubChemFP403': ('[#7](:c)(:c)', 0),
+    'PubChemFP404': ('[#7](:c)(:c)(:c)', 0),
+    'PubChemFP405': ('[#8](~[#6])(~[#6])', 0),
+    'PubChemFP406': ('[#8&!H0](~[#6])', 0),
+    'PubChemFP407': ('[#8](~[#6])(~[#15])', 0),
+    'PubChemFP408': ('[#8&!H0](~[#16])', 0),
+    'PubChemFP409': ('[#8](:c)(:c)', 0),
+    'PubChemFP410': ('[#15](~[#6])(~[#6])', 0),
+    'PubChemFP411': ('[#15](~[#8])(~[#8])', 0),
+    'PubChemFP412': ('[#16](~[#6])(~[#6])', 0),
+    'PubChemFP413': ('[#16&!H0](~[#6])', 0),
+    'PubChemFP414': ('[#16](~[#6])(~[#8])', 0),
+    'PubChemFP415': ('[#14](~[#6])(~[#6])', 0),
+    'PubChemFP416': ('[#6]=,:[#6]', 0),
+    'PubChemFP417': ('[#6]#[#6]', 0),
+    'PubChemFP418': ('[#6]=,:[#7]', 0),
+    'PubChemFP419': ('[#6]#[#7]', 0),
+    'PubChemFP420': ('[#6]=,:[#8]', 0),
+    'PubChemFP421': ('[#6]=,:[#16]', 0),
+    'PubChemFP422': ('[#7]=,:[#7]', 0),
+    'PubChemFP423': ('[#7]=,:[#8]', 0),
+    'PubChemFP424': ('[#7]=,:[#15]', 0),
+    'PubChemFP425': ('[#15]=,:[#8]', 0),
+    'PubChemFP426': ('[#15]=,:[#15]', 0),
+    'PubChemFP427': ('[#6](#[#6])(-,:[#6])', 0),
+    'PubChemFP428': ('[#6&!H0](#[#6])', 0),
+    'PubChemFP429': ('[#6](#[#7])(-,:[#6])', 0),
+    'PubChemFP430': ('[#6](-,:[#6])(-,:[#6])(=,:[#6])', 0),
+    'PubChemFP431': ('[#6](-,:[#6])(-,:[#6])(=,:[#7])', 0),
+    'PubChemFP432': ('[#6](-,:[#6])(-,:[#6])(=,:[#8])', 0),
+    'PubChemFP433': ('[#6](-,:[#6])([Cl])(=,:[#8])', 0),
+    'PubChemFP434': ('[#6&!H0](-,:[#6])(=,:[#6])', 0),
+    'PubChemFP435': ('[#6&!H0](-,:[#6])(=,:[#7])', 0),
+    'PubChemFP436': ('[#6&!H0](-,:[#6])(=,:[#8])', 0),
+    'PubChemFP437': ('[#6](-,:[#6])(-,:[#7])(=,:[#6])', 0),
+    'PubChemFP438': ('[#6](-,:[#6])(-,:[#7])(=,:[#7])', 0),
+    'PubChemFP439': ('[#6](-,:[#6])(-,:[#7])(=,:[#8])', 0),
+    'PubChemFP440': ('[#6](-,:[#6])(-,:[#8])(=,:[#8])', 0),
+    'PubChemFP441': ('[#6](-,:[#6])(=,:[#6])', 0),
+    'PubChemFP442': ('[#6](-,:[#6])(=,:[#7])', 0),
+    'PubChemFP443': ('[#6](-,:[#6])(=,:[#8])', 0),
+    'PubChemFP444': ('[#6]([Cl])(=,:[#8])', 0),
+    'PubChemFP445': ('[#6&!H0](-,:[#7])(=,:[#6])', 0),
+    'PubChemFP446': ('[#6&!H0](=,:[#6])', 0),
+    'PubChemFP447': ('[#6&!H0](=,:[#7])', 0),
+    'PubChemFP448': ('[#6&!H0](=,:[#8])', 0),
+    'PubChemFP449': ('[#6](-,:[#7])(=,:[#6])', 0),
+    'PubChemFP450': ('[#6](-,:[#7])(=,:[#7])', 0),
+    'PubChemFP451': ('[#6](-,:[#7])(=,:[#8])', 0),
+    'PubChemFP452': ('[#6](-,:[#8])(=,:[#8])', 0),
+    'PubChemFP453': ('[#7](-,:[#6])(=,:[#6])', 0),
+    'PubChemFP454': ('[#7](-,:[#6])(=,:[#8])', 0),
+    'PubChemFP455': ('[#7](-,:[#8])(=,:[#8])', 0),
+    'PubChemFP456': ('[#15](-,:[#8])(=,:[#8])', 0),
+    'PubChemFP457': ('[#16](-,:[#6])(=,:[#8])', 0),
+    'PubChemFP458': ('[#16](-,:[#8])(=,:[#8])', 0),
+    'PubChemFP459': ('[#16](=,:[#8])(=,:[#8])', 0),
+    'PubChemFP460': ('[#6]-,:[#6]-,:[#6]#[#6]', 0),
+    'PubChemFP461': ('[#8]-,:[#6]-,:[#6]=,:[#7]', 0),
+    'PubChemFP462': ('[#8]-,:[#6]-,:[#6]=,:[#8]', 0),
+    'PubChemFP463': ('[#7]:[#6]-,:[#16&!H0]', 0),
+    'PubChemFP464': ('[#7]-,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP465': ('[#8]=,:[#16]-,:[#6]-,:[#6]', 0),
+    'PubChemFP466': ('[#7]#[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP467': ('[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
+    'PubChemFP468': ('[#8]=,:[#16]-,:[#6]-,:[#7]', 0),
+    'PubChemFP469': ('[#16]-,:[#16]-,:[#6]:[#6]', 0),
+    'PubChemFP470': ('[#6]:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP471': ('[#16]:[#6]:[#6]:[#6]', 0),
+    'PubChemFP472': ('[#6]:[#7]:[#6]-,:[#6]', 0),
+    'PubChemFP473': ('[#16]-,:[#6]:[#7]:[#6]', 0),
+    'PubChemFP474': ('[#16]:[#6]:[#6]:[#7]', 0),
+    'PubChemFP475': ('[#16]-,:[#6]=,:[#7]-,:[#6]', 0),
+    'PubChemFP476': ('[#6]-,:[#8]-,:[#6]=,:[#6]', 0),
+    'PubChemFP477': ('[#7]-,:[#7]-,:[#6]:[#6]', 0),
+    'PubChemFP478': ('[#16]-,:[#6]=,:[#7&!H0]', 0),
+    'PubChemFP479': ('[#16]-,:[#6]-,:[#16]-,:[#6]', 0),
+    'PubChemFP480': ('[#6]:[#16]:[#6]-,:[#6]', 0),
+    'PubChemFP481': ('[#8]-,:[#16]-,:[#6]:[#6]', 0),
+    'PubChemFP482': ('[#6]:[#7]-,:[#6]:[#6]', 0),
+    'PubChemFP483': ('[#7]-,:[#16]-,:[#6]:[#6]', 0),
+    'PubChemFP484': ('[#7]-,:[#6]:[#7]:[#6]', 0),
+    'PubChemFP485': ('[#7]:[#6]:[#6]:[#7]', 0),
+    'PubChemFP486': ('[#7]-,:[#6]:[#7]:[#7]', 0),
+    'PubChemFP487': ('[#7]-,:[#6]=,:[#7]-,:[#6]', 0),
+    'PubChemFP488': ('[#7]-,:[#6]=,:[#7&!H0]', 0),
+    'PubChemFP489': ('[#7]-,:[#6]-,:[#16]-,:[#6]', 0),
+    'PubChemFP490': ('[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP491': ('[#6]-,:[#7]:[#6&!H0]', 0),
+    'PubChemFP492': ('[#7]-,:[#6]:[#8]:[#6]', 0),
+    'PubChemFP493': ('[#8]=,:[#6]-,:[#6]:[#6]', 0),
+    'PubChemFP494': ('[#8]=,:[#6]-,:[#6]:[#7]', 0),
+    'PubChemFP495': ('[#6]-,:[#7]-,:[#6]:[#6]', 0),
+    'PubChemFP496': ('[#7]:[#7]-,:[#6&!H0]', 0),
+    'PubChemFP497': ('[#8]-,:[#6]:[#6]:[#7]', 0),
+    'PubChemFP498': ('[#8]-,:[#6]=,:[#6]-,:[#6]', 0),
+    'PubChemFP499': ('[#7]-,:[#6]:[#6]:[#7]', 0),
+    'PubChemFP500': ('[#6]-,:[#16]-,:[#6]:[#6]', 0),
+    'PubChemFP501': ('[Cl]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP502': ('[#7]-,:[#6]=,:[#6&!H0]', 0),
+    'PubChemFP503': ('[Cl]-,:[#6]:[#6&!H0]', 0),
+    'PubChemFP504': ('[#7]:[#6]:[#7]-,:[#6]', 0),
+    'PubChemFP505': ('[Cl]-,:[#6]:[#6]-,:[#8]', 0),
+    'PubChemFP506': ('[#6]-,:[#6]:[#7]:[#6]', 0),
+    'PubChemFP507': ('[#6]-,:[#6]-,:[#16]-,:[#6]', 0),
+    'PubChemFP508': ('[#16]=,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP509': ('[Br]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP510': ('[#7&!H0]-,:[#7&!H0]', 0),
+    'PubChemFP511': ('[#16]=,:[#6]-,:[#7&!H0]', 0),
+    'PubChemFP512': ('[#6]-,:[#33]-[#8&!H0]', 0),
+    'PubChemFP513': ('[#16]:[#6]:[#6&!H0]', 0),
+    'PubChemFP514': ('[#8]-,:[#7]-,:[#6]-,:[#6]', 0),
+    'PubChemFP515': ('[#7]-,:[#7]-,:[#6]-,:[#6]', 0),
+    'PubChemFP516': ('[#6H,#6H2,#6H3]=,:[#6H,#6H2,#6H3]', 0),
+    'PubChemFP517': ('[#7]-,:[#7]-,:[#6]-,:[#7]', 0),
+    'PubChemFP518': ('[#8]=,:[#6]-,:[#7]-,:[#7]', 0),
+    'PubChemFP519': ('[#7]=,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP520': ('[#6]=,:[#6]-,:[#6]:[#6]', 0),
+    'PubChemFP521': ('[#6]:[#7]-,:[#6&!H0]', 0),
+    'PubChemFP522': ('[#6]-,:[#7]-,:[#7&!H0]', 0),
+    'PubChemFP523': ('[#7]:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP524': ('[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
+    'PubChemFP525': ('[#33]-,:[#6]:[#6&!H0]', 0),
+    'PubChemFP526': ('[Cl]-,:[#6]:[#6]-,:[Cl]', 0),
+    'PubChemFP527': ('[#6]:[#6]:[#7&!H0]', 0),
+    'PubChemFP528': ('[#7&!H0]-,:[#6&!H0]', 0),
+    'PubChemFP529': ('[Cl]-,:[#6]-,:[#6]-,:[Cl]', 0),
+    'PubChemFP530': ('[#7]:[#6]-,:[#6]:[#6]', 0),
+    'PubChemFP531': ('[#16]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP532': ('[#16]-,:[#6]:[#6&!H0]', 0),
+    'PubChemFP533': ('[#16]-,:[#6]:[#6]-,:[#7]', 0),
+    'PubChemFP534': ('[#16]-,:[#6]:[#6]-,:[#8]', 0),
+    'PubChemFP535': ('[#8]=,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP536': ('[#8]=,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP537': ('[#8]=,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP538': ('[#7]=,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP539': ('[#7]=,:[#6]-,:[#6&!H0]', 0),
+    'PubChemFP540': ('[#6]-,:[#7]-,:[#6&!H0]', 0),
+    'PubChemFP541': ('[#8]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP542': ('[#8]-,:[#6]:[#6&!H0]', 0),
+    'PubChemFP543': ('[#8]-,:[#6]:[#6]-,:[#7]', 0),
+    'PubChemFP544': ('[#8]-,:[#6]:[#6]-,:[#8]', 0),
+    'PubChemFP545': ('[#7]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP546': ('[#7]-,:[#6]:[#6&!H0]', 0),
+    'PubChemFP547': ('[#7]-,:[#6]:[#6]-,:[#7]', 0),
+    'PubChemFP548': ('[#8]-,:[#6]-,:[#6]:[#6]', 0),
+    'PubChemFP549': ('[#7]-,:[#6]-,:[#6]:[#6]', 0),
+    'PubChemFP550': ('[Cl]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP551': ('[Cl]-,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP552': ('[#6]:[#6]-,:[#6]:[#6]', 0),
+    'PubChemFP553': ('[#8]=,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP554': ('[Br]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP555': ('[#7]=,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP556': ('[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP557': ('[#7]:[#6]-,:[#8&!H0]', 0),
+    'PubChemFP558': ('[#8]=,:[#7]-,:c:c', 0),
+    'PubChemFP559': ('[#8]-,:[#6]-,:[#7&!H0]', 0),
+    'PubChemFP560': ('[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP561': ('[Cl]-,:[#6]-,:[#6]=,:[#8]', 0),
+    'PubChemFP562': ('[Br]-,:[#6]-,:[#6]=,:[#8]', 0),
+    'PubChemFP563': ('[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP564': ('[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP565': ('[#6]:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP566': ('[#8]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP567': ('[#8]-,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP568': ('N#[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP569': ('[#7]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP570': ('[#6]:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP571': ('[#6&!H0]-,:[#8&!H0]', 0),
+    'PubChemFP572': ('n:c:n:c', 0),
+    'PubChemFP573': ('[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP574': ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP575': ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
+    'PubChemFP576': ('[#7]=,:[#6]-,:[#6]:[#6&!H0]', 0),
+    'PubChemFP577': ('c:c-,:[#7]-,:c:c', 0),
+    'PubChemFP578': ('[#6]-,:[#6]:[#6]-,:c:c', 0),
+    'PubChemFP579': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP580': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP581': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP582': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP583': ('[Cl]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP584': ('c:c-,:[#6]=,:[#6]-,:[#6]', 0),
+    'PubChemFP585': ('[#6]-,:[#6]:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP586': ('[#6]-,:[#16]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP587': ('[#7]-,:[#6]:[#6]-,:[#8&!H0]', 0),
+    'PubChemFP588': ('[#8]=,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
+    'PubChemFP589': ('[#6]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP590': ('[#6]-,:[#6]:[#6]-,:[#8&!H0]', 0),
+    'PubChemFP591': ('[Cl]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP592': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP593': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP594': ('[#6]-,:[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP595': ('c:c-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP596': ('[#7]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
+    'PubChemFP597': ('[#8]=,:[#6]-,:[#6]-,:c:c', 0),
+    'PubChemFP598': ('[Cl]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP599': ('[#6H,#6H2,#6H3]-,:[#6]=,:[#6H,#6H2,#6H3]', 0),
+    'PubChemFP600': ('[#7]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP601': ('[#7]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
+    'PubChemFP602': ('[#8]=,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP603': ('[#6]-,:c:c:[#6]-,:[#6]', 0),
+    'PubChemFP604': ('[#6]-,:[#8]-,:[#6]-,:[#6]:c', 0),
+    'PubChemFP605': ('[#8]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP606': ('[#8]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP607': ('[#7]-,:[#6]-,:[#6]-,:[#6]:c', 0),
+    'PubChemFP608': ('[#6]-,:[#6]-,:[#6]-,:[#6]:c', 0),
+    'PubChemFP609': ('[Cl]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP610': ('[#6]-,:[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP611': ('[#7]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP612': ('[#7]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
+    'PubChemFP613': ('[#6]-,:[#7]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP614': ('[#6]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
+    'PubChemFP615': ('[#7]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP616': ('c:c:n:n:c', 0),
+    'PubChemFP617': ('[#6]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
+    'PubChemFP618': ('c:[#6]-,:[#6]-,:[#6]:c', 0),
+    'PubChemFP619': ('[#8]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
+    'PubChemFP620': ('c:c-,:[#8]-,:[#6]-,:[#6]', 0),
+    'PubChemFP621': ('[#7]-,:[#6]:c:c:n', 0),
+    'PubChemFP622': ('[#8]=,:[#6]-,:[#8]-,:[#6]:c', 0),
+    'PubChemFP623': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP624': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#7]', 0),
+    'PubChemFP625': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
+    'PubChemFP626': ('[#6]-,:[#8]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP627': ('[#8]=,:[#33]-,:[#6]:c:c', 0),
+    'PubChemFP628': ('[#6]-,:[#7]-,:[#6]-,:[#6]:c', 0),
+    'PubChemFP629': ('[#16]-,:[#6]:c:c-,:[#7]', 0),
+    'PubChemFP630': ('[#8]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP631': ('[#8]-,:[#6]:[#6]-,:[#8&!H0]', 0),
+    'PubChemFP632': ('[#6]-,:[#6]-,:[#8]-,:[#6]:c', 0),
+    'PubChemFP633': ('[#7]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP634': ('[#6]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP635': ('[#7]-,:[#7]-,:[#6]-,:[#7&!H0]', 0),
+    'PubChemFP636': ('[#6]-,:[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP637': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP638': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP639': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP640': ('[#6]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP641': ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP642': ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
+    'PubChemFP643': ('[#6&!H0]-,:[#6]-,:[#7&!H0]', 0),
+    'PubChemFP644': ('[#6]-,:[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
+    'PubChemFP645': ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
+    'PubChemFP646': ('[#8]=,:[#6]-,:[#7]-,:[#6&!H0]', 0),
+    'PubChemFP647': ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#7]', 0),
+    'PubChemFP648': ('[#8]=,:[#7]-,:[#6]:[#6]-,:[#7]', 0),
+    'PubChemFP649': ('[#8]=,:[#7]-,:c:c-,:[#8]', 0),
+    'PubChemFP650': ('[#8]=,:[#6]-,:[#7]-,:[#6]=,:[#8]', 0),
+    'PubChemFP651': ('[#8]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP652': ('[#8]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
+    'PubChemFP653': ('[#8]-,:[#6]:[#6]:[#6]-,:[#8]', 0),
+    'PubChemFP654': ('[#7]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
+    'PubChemFP655': ('[#8]-,:[#6]-,:[#6]-,:[#6]:c', 0),
+    'PubChemFP656': ('[#6]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
+    'PubChemFP657': ('[#6]-,:[#7]-,:[#6]:[#6]-,:[#6]', 0),
+    'PubChemFP658': ('[#6]-,:[#6]-,:[#16]-,:[#6]-,:[#6]', 0),
+    'PubChemFP659': ('[#8]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP660': ('[#6]-,:[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP661': ('[#8]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
+    'PubChemFP662': ('[#8]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP663': ('[#8]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
+    'PubChemFP664': ('[#6]-,:[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
+    'PubChemFP665': ('[#7]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP666': ('[#6]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP667': ('[#6]=,:[#6]-,:[#6]-,:[#8&!H0]', 0),
+    'PubChemFP668': ('[#6]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP669': ('[Cl]-,:[#6]:[#6]-,:[#6]=,:[#8]', 0),
+    'PubChemFP670': ('[Br]-,:[#6]:c:c-,:[#6]', 0),
+    'PubChemFP671': ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
+    'PubChemFP672': ('[#8]=,:[#6]-,:[#6]=,:[#6&!H0]', 0),
+    'PubChemFP673': ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#7]', 0),
+    'PubChemFP674': ('[#7]-,:[#6]-,:[#7]-,:[#6]:c', 0),
+    'PubChemFP675': ('[Br]-,:[#6]-,:[#6]-,:[#6]:c', 0),
+    'PubChemFP676': ('[#7]#[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP677': ('[#6]-,:[#6]=,:[#6]-,:[#6]:c', 0),
+    'PubChemFP678': ('[#6]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
+    'PubChemFP679': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP680': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP681': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP682': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP683': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP684': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP685': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP686': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP687': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
+    'PubChemFP688': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP689': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP690': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP691': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP692': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP693': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
+    'PubChemFP694': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
+    'PubChemFP695': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
+    'PubChemFP696': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP697': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
+    'PubChemFP698': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP699': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
+    'PubChemFP700': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
+    'PubChemFP701': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
+    'PubChemFP702': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
+    'PubChemFP703': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
+    'PubChemFP704': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP705': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
+    'PubChemFP706': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](=,:[#8])-,:[#6]', 0),
+    'PubChemFP707': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
+    'PubChemFP708': ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
+    'PubChemFP709': ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]-,:[#6]', 0),
+    'PubChemFP710': ('[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
+    'PubChemFP711': ('[#6]-,:[#6](-,:[#6])(-,:[#6])-,:[#6]-,:[#6]', 0),
+    'PubChemFP712': ('[#6]-,:[#6](-,:[#6])-,:[#6](-,:[#6])-,:[#6]', 0),
+    'PubChemFP713': ('[#6]c1ccc([#6])cc1', 0),
+    'PubChemFP714': ('[#6]c1ccc([#8])cc1', 0),
+    'PubChemFP715': ('[#6]c1ccc([#16])cc1', 0),
+    'PubChemFP716': ('[#6]c1ccc([#7])cc1', 0),
+    'PubChemFP717': ('[#6]c1ccc(Cl)cc1', 0),
+    'PubChemFP718': ('[#6]c1ccc(Br)cc1', 0),
+    'PubChemFP719': ('[#8]c1ccc([#8])cc1', 0),
+    'PubChemFP720': ('[#8]c1ccc([#16])cc1', 0),
+    'PubChemFP721': ('[#8]c1ccc([#7])cc1', 0),
+    'PubChemFP722': ('[#8]c1ccc(Cl)cc1', 0),
+    'PubChemFP723': ('[#8]c1ccc(Br)cc1', 0),
+    'PubChemFP724': ('[#16]c1ccc([#16])cc1', 0),
+    'PubChemFP725': ('[#16]c1ccc([#7])cc1', 0),
+    'PubChemFP726': ('[#16]c1ccc(Cl)cc1', 0),
+    'PubChemFP727': ('[#16]c1ccc(Br)cc1', 0),
+    'PubChemFP728': ('[#7]c1ccc([#7])cc1', 0),
+    'PubChemFP729': ('[#7]c1ccc(Cl)cc1', 0),
+    'PubChemFP730': ('[#7]c1ccc(Br)cc1', 0),
+    'PubChemFP731': ('Clc1ccc(Cl)cc1', 0),
+    'PubChemFP732': ('Clc1ccc(Br)cc1', 0),
+    'PubChemFP733': ('Brc1ccc(Br)cc1', 0),
+    'PubChemFP734': ('[#6]c1cc([#6])ccc1', 0),
+    'PubChemFP735': ('[#6]c1cc([#8])ccc1', 0),
+    'PubChemFP736': ('[#6]c1cc([#16])ccc1', 0),
+    'PubChemFP737': ('[#6]c1cc([#7])ccc1', 0),
+    'PubChemFP738': ('[#6]c1cc(Cl)ccc1', 0),
+    'PubChemFP739': ('[#6]c1cc(Br)ccc1', 0),
+    'PubChemFP740': ('[#8]c1cc([#8])ccc1', 0),
+    'PubChemFP741': ('[#8]c1cc([#16])ccc1', 0),
+    'PubChemFP742': ('[#8]c1cc([#7])ccc1', 0),
+    'PubChemFP743': ('[#8]c1cc(Cl)ccc1', 0),
+    'PubChemFP744': ('[#8]c1cc(Br)ccc1', 0),
+    'PubChemFP745': ('[#16]c1cc([#16])ccc1', 0),
+    'PubChemFP746': ('[#16]c1cc([#7])ccc1', 0),
+    'PubChemFP747': ('[#16]c1cc(Cl)ccc1', 0),
+    'PubChemFP748': ('[#16]c1cc(Br)ccc1', 0),
+    'PubChemFP749': ('[#7]c1cc([#7])ccc1', 0),
+    'PubChemFP750': ('[#7]c1cc(Cl)ccc1', 0),
+    'PubChemFP751': ('[#7]c1cc(Br)ccc1', 0),
+    'PubChemFP752': ('Clc1cc(Cl)ccc1', 0),
+    'PubChemFP753': ('Clc1cc(Br)ccc1', 0),
+    'PubChemFP754': ('Brc1cc(Br)ccc1', 0),
+    'PubChemFP755': ('[#6]c1c([#6])cccc1', 0),
+    'PubChemFP756': ('[#6]c1c([#8])cccc1', 0),
+    'PubChemFP757': ('[#6]c1c([#16])cccc1', 0),
+    'PubChemFP758': ('[#6]c1c([#7])cccc1', 0),
+    'PubChemFP759': ('[#6]c1c(Cl)cccc1', 0),
+    'PubChemFP760': ('[#6]c1c(Br)cccc1', 0),
+    'PubChemFP761': ('[#8]c1c([#8])cccc1', 0),
+    'PubChemFP762': ('[#8]c1c([#16])cccc1', 0),
+    'PubChemFP763': ('[#8]c1c([#7])cccc1', 0),
+    'PubChemFP764': ('[#8]c1c(Cl)cccc1', 0),
+    'PubChemFP765': ('[#8]c1c(Br)cccc1', 0),
+    'PubChemFP766': ('[#16]c1c([#16])cccc1', 0),
+    'PubChemFP767': ('[#16]c1c([#7])cccc1', 0),
+    'PubChemFP768': ('[#16]c1c(Cl)cccc1', 0),
+    'PubChemFP769': ('[#16]c1c(Br)cccc1', 0),
+    'PubChemFP770': ('[#7]c1c([#7])cccc1', 0),
+    'PubChemFP771': ('[#7]c1c(Cl)cccc1', 0),
+    'PubChemFP772': ('[#7]c1c(Br)cccc1', 0),
+    'PubChemFP773': ('Clc1c(Cl)cccc1', 0),
+    'PubChemFP774': ('Clc1c(Br)cccc1', 0),
+    'PubChemFP775': ('Brc1c(Br)cccc1', 0),
+    'PubChemFP776': ('[#6][#6]1[#6][#6][#6]([#6])[#6][#6]1', 0),
+    'PubChemFP777': ('[#6][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
+    'PubChemFP778': ('[#6][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
+    'PubChemFP779': ('[#6][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
+    'PubChemFP780': ('[#6][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP781': ('[#6][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP782': ('[#8][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
+    'PubChemFP783': ('[#8][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
+    'PubChemFP784': ('[#8][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
+    'PubChemFP785': ('[#8][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP786': ('[#8][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP787': ('[#16][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
+    'PubChemFP788': ('[#16][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
+    'PubChemFP789': ('[#16][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP790': ('[#16][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP791': ('[#7][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
+    'PubChemFP792': ('[#7][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP793': ('[#7][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP794': ('Cl[#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP795': ('Cl[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP796': ('Br[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP797': ('[#6][#6]1[#6][#6]([#6])[#6][#6][#6]1', 0),
+    'PubChemFP798': ('[#6][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
+    'PubChemFP799': ('[#6][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
+    'PubChemFP800': ('[#6][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
+    'PubChemFP801': ('[#6][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP802': ('[#6][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP803': ('[#8][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
+    'PubChemFP804': ('[#8][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
+    'PubChemFP805': ('[#8][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
+    'PubChemFP806': ('[#8][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP807': ('[#8][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP808': ('[#16][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
+    'PubChemFP809': ('[#16][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
+    'PubChemFP810': ('[#16][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP811': ('[#16][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP812': ('[#7][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
+    'PubChemFP813': ('[#7][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP814': ('[#7][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP815': ('Cl[#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP816': ('Cl[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP817': ('Br[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP818': ('[#6][#6]1[#6]([#6])[#6][#6][#6][#6]1', 0),
+    'PubChemFP819': ('[#6][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
+    'PubChemFP820': ('[#6][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
+    'PubChemFP821': ('[#6][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
+    'PubChemFP822': ('[#6][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    'PubChemFP823': ('[#6][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    'PubChemFP824': ('[#8][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
+    'PubChemFP825': ('[#8][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
+    'PubChemFP826': ('[#8][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
+    'PubChemFP827': ('[#8][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    'PubChemFP828': ('[#8][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    'PubChemFP829': ('[#16][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
+    'PubChemFP830': ('[#16][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
+    'PubChemFP831': ('[#16][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    'PubChemFP832': ('[#16][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    'PubChemFP833': ('[#7][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
+    'PubChemFP834': ('[#7][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    'PubChemFP835': ('[#7][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    'PubChemFP836': ('Cl[#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
+    'PubChemFP837': ('Cl[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    'PubChemFP838': ('Br[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
+    'PubChemFP839': ('[#6][#6]1[#6][#6]([#6])[#6][#6]1', 0),
+    'PubChemFP840': ('[#6][#6]1[#6][#6]([#8])[#6][#6]1', 0),
+    'PubChemFP841': ('[#6][#6]1[#6][#6]([#16])[#6][#6]1', 0),
+    'PubChemFP842': ('[#6][#6]1[#6][#6]([#7])[#6][#6]1', 0),
+    'PubChemFP843': ('[#6][#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP844': ('[#6][#6]1[#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP845': ('[#8][#6]1[#6][#6]([#8])[#6][#6]1', 0),
+    'PubChemFP846': ('[#8][#6]1[#6][#6]([#16])[#6][#6]1', 0),
+    'PubChemFP847': ('[#8][#6]1[#6][#6]([#7])[#6][#6]1', 0),
+    'PubChemFP848': ('[#8][#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP849': ('[#8][#6]1[#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP850': ('[#16][#6]1[#6][#6]([#16])[#6][#6]1', 0),
+    'PubChemFP851': ('[#16][#6]1[#6][#6]([#7])[#6][#6]1', 0),
+    'PubChemFP852': ('[#16][#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP853': ('[#16][#6]1[#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP854': ('[#7][#6]1[#6][#6]([#7])[#6][#6]1', 0),
+    'PubChemFP855': ('[#7][#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP856': ('[#7][#6]1[#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP857': ('Cl[#6]1[#6][#6](Cl)[#6][#6]1', 0),
+    'PubChemFP858': ('Cl[#6]1[#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP859': ('Br[#6]1[#6][#6](Br)[#6][#6]1', 0),
+    'PubChemFP860': ('[#6][#6]1[#6]([#6])[#6][#6][#6]1', 0),
+    'PubChemFP861': ('[#6][#6]1[#6]([#8])[#6][#6][#6]1', 0),
+    'PubChemFP862': ('[#6][#6]1[#6]([#16])[#6][#6][#6]1', 0),
+    'PubChemFP863': ('[#6][#6]1[#6]([#7])[#6][#6][#6]1', 0),
+    'PubChemFP864': ('[#6][#6]1[#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP865': ('[#6][#6]1[#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP866': ('[#8][#6]1[#6]([#8])[#6][#6][#6]1', 0),
+    'PubChemFP867': ('[#8][#6]1[#6]([#16])[#6][#6][#6]1', 0),
+    'PubChemFP868': ('[#8][#6]1[#6]([#7])[#6][#6][#6]1', 0),
+    'PubChemFP869': ('[#8][#6]1[#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP870': ('[#8][#6]1[#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP871': ('[#16][#6]1[#6]([#16])[#6][#6][#6]1', 0),
+    'PubChemFP872': ('[#16][#6]1[#6]([#7])[#6][#6][#6]1', 0),
+    'PubChemFP873': ('[#16][#6]1[#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP874': ('[#16][#6]1[#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP875': ('[#7][#6]1[#6]([#7])[#6][#6][#6]1', 0),
+    'PubChemFP876': ('[#7][#6]1[#6](Cl)[#6][#6]1', 0),
+    'PubChemFP877': ('[#7][#6]1[#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP878': ('Cl[#6]1[#6](Cl)[#6][#6][#6]1', 0),
+    'PubChemFP879': ('Cl[#6]1[#6](Br)[#6][#6][#6]1', 0),
+    'PubChemFP880': ('Br[#6]1[#6](Br)[#6][#6][#6]1', 0)}

deepscreen/data/featurizers/fingerprint/torsions.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from rdkit.Chem.AtomPairs import Torsions
+from rdkit.Chem import DataStructs
+import numpy as np
+# Category label for the fingerprint defined in this module.
+# NOTE(review): presumably read by code that groups fingerprint families — confirm against callers.
+_type = 'topological-based'
+def GetTorsionFPs(mol, nBits=2048, binary=True):
+    '''
+    atompairs fingerprints
+    '''
+    fp = Torsions.GetHashedTopologicalTorsionFingerprint(mol, nBits=nBits)
+    if binary:
+        arr = np.zeros((0,), dtype=np.bool_)
+    else:
+        arr = np.zeros((0,), dtype=np.int8)
+    DataStructs.ConvertToNumpyArray(fp, arr)
+    return arr

deepscreen/data/featurizers/graph.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import networkx as nx
+import numpy as np
+import torch
+from rdkit import Chem
+from torch_geometric.utils import from_smiles
+from torch_geometric.data import Data
+from deepscreen.data.featurizers.categorical import one_of_k_encoding_unk, one_of_k_encoding
+from deepscreen.utils import get_logger
+# Module-level logger, created through the project's get_logger helper.
+log = get_logger(__name__)
def atom_features(atom, explicit_H=False, use_chirality=True):
    """
    Build the feature vector of a single RDKit atom (adapted from TransformerCPI 2.0).

    Layout, in order:
      * element symbol encoded over 10 classes (last class is 'other')
      * degree encoded over 0..6 (7 entries)
      * formal charge and number of radical electrons (2 entries)
      * hybridization encoded over 6 classes (last class is 'other')
      * aromaticity flag (1 entry)
      * total hydrogen count encoded over 0..4 (5 entries), skipped when ``explicit_H``
      * R/S CIP code encoding plus the '_ChiralityPossible' flag (3 entries),
        only when ``use_chirality``

    With the defaults this gives 34 entries.

    :param atom: RDKit atom.
    :param explicit_H: skip the hydrogen-count features (for datasets with
        explicit hydrogens such as QM8/QM9, to avoid calling ``GetTotalNumHs``).
    :param use_chirality: append the chirality features.
    :return: 1-D ``np.ndarray`` of the concatenated features.
    """
    symbol = ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I', 'other']  # 10-dim
    degree = [0, 1, 2, 3, 4, 5, 6]  # 7-dim
    hybridization_type = [Chem.rdchem.HybridizationType.SP,
                          Chem.rdchem.HybridizationType.SP2,
                          Chem.rdchem.HybridizationType.SP3,
                          Chem.rdchem.HybridizationType.SP3D,
                          Chem.rdchem.HybridizationType.SP3D2,
                          'other']  # 6-dim

    # 10+7+2+6+1=26
    # NOTE(review): one_of_k_encoding (no "_unk") is used for the degree;
    # presumably it rejects degrees above 6 - confirm in categorical.py.
    results = (
        one_of_k_encoding_unk(atom.GetSymbol(), symbol)
        + one_of_k_encoding(atom.GetDegree(), degree)
        + [atom.GetFormalCharge(), atom.GetNumRadicalElectrons()]
        + one_of_k_encoding_unk(atom.GetHybridization(), hybridization_type)
        + [atom.GetIsAromatic()]
    )

    # In case of explicit hydrogen(QM8, QM9), avoid calling `GetTotalNumHs`
    # 26+5=31
    if not explicit_H:
        results = results + one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4])

    # 31+3=34
    if use_chirality:
        # Atoms without an assigned CIP code have no '_CIPCode' property. Check
        # for it explicitly instead of catching every exception, so genuine
        # errors are no longer silently turned into "no chirality".
        if atom.HasProp('_CIPCode'):
            cip_code = one_of_k_encoding_unk(atom.GetProp('_CIPCode'), ['R', 'S'])
        else:
            cip_code = [False, False]
        results = results + cip_code + [atom.HasProp('_ChiralityPossible')]

    return np.array(results)
def bond_features(bond):
    """
    Encode an RDKit bond as six flags, in order: single, double, triple,
    aromatic, conjugated, in-ring.
    """
    kind = bond.GetBondType()
    bond_orders = (
        Chem.rdchem.BondType.SINGLE,
        Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE,
        Chem.rdchem.BondType.AROMATIC,
    )
    flags = [kind == order for order in bond_orders]
    flags.append(bond.GetIsConjugated())
    flags.append(bond.IsInRing())
    return np.array(flags)
def smiles_to_graph_pyg(smiles):
    """
    Featurize a SMILES string with PyTorch Geometric's built-in ``from_smiles``.

    Any exception is logged as a warning and ``None`` is returned instead.
    """
    try:
        graph = from_smiles(smiles)
    except Exception as err:
        log.warning(f"Failed to featurize the following SMILES to graph: {smiles} due to {err}")
        graph = None
    return graph
def smiles_to_graph(smiles, atom_features: callable = atom_features):
    """
    Turn a SMILES string into a PyG ``Data`` graph using a custom atom featurizer.

    Each atom's feature vector is divided by its own sum. Bonds become edges in
    both directions (via an undirected networkx graph converted to a directed
    one); a molecule without bonds gets a single ``0 -> 0`` self-loop.
    Any exception is logged as a warning and ``None`` is returned instead.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)

        node_rows = []
        for atom in mol.GetAtoms():
            raw = atom_features(atom)
            node_rows.append(raw / sum(raw))
        node_matrix = np.array(node_rows)

        bonded_pairs = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
        if len(bonded_pairs) == 0:
            edge_index = [[0, 0]]
        else:
            directed = nx.Graph(bonded_pairs).to_directed()
            edge_index = [[src, dst] for src, dst in directed.edges]

        return Data(x=torch.Tensor(node_matrix),
                    edge_index=torch.LongTensor(edge_index).transpose(0, 1))
    except Exception as err:
        log.warning(f"Failed to convert SMILES ({smiles}) to graph due to {err}")
        return None
+    # features = []
+    # for atom in mol.GetAtoms():
+    #     feature = atom_features(atom)
+    #     features.append(feature / sum(feature))
+    #
+    # edge_indices = []
+    # for bond in mol.GetBonds():
+    #     i = bond.GetBeginAtomIdx()
+    #     j = bond.GetEndAtomIdx()
+    #     edge_indices += [[i, j], [j, i]]
+    #
+    # edge_index = torch.tensor(edge_indices)
+    # edge_index = edge_index.t().to(torch.long).view(2, -1)
+    #
+    # if edge_index.numel() > 0:  # Sort indices.
+    #     perm = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
+    #     edge_index = edge_index[:, perm]
+    #
def smiles_to_mol_features(smiles, num_atom_feat=None):
    """
    Convert a SMILES string into an atom-feature matrix and an adjacency matrix.

    :param smiles: SMILES string of the molecule.
    :param num_atom_feat: ignored. The feature width is always taken from the
        output of ``atom_features``. The parameter is kept (now optional) only
        so existing positional callers keep working.
    :return: ``(atom_feat, adj_mat)`` where ``atom_feat`` has one row per atom
        (indexed by atom index) and ``adj_mat`` is the RDKit adjacency matrix
        as an array; ``None`` if featurization fails (a warning is logged).
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Fail with a readable reason rather than an AttributeError on None.
            raise ValueError("RDKit could not parse the SMILES")

        atom_feat = None
        for atom in mol.GetAtoms():
            row = atom_features(atom)
            if atom_feat is None:
                # Size the matrix from the first featurized atom, so every
                # atom is featurized exactly once.
                atom_feat = np.zeros((mol.GetNumAtoms(), len(row)))
            atom_feat[atom.GetIdx(), :] = row
        if atom_feat is None:
            raise ValueError("the molecule has no atoms")

        adj_mat = np.array(Chem.GetAdjacencyMatrix(mol))
        return atom_feat, adj_mat
    except Exception as e:
        log.warning(f"Failed to featurize the following SMILES to molecular features: {smiles} due to {str(e)}")
        return None

deepscreen/data/featurizers/monn.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import numpy as np
+from rdkit.Chem import MolFromSmiles
+from deepscreen.data.featurizers.categorical import FASTA_VOCAB, fasta_to_label
+from deepscreen.data.featurizers.graph import atom_features, bond_features
def get_mask(arr):
    """
    Return a ``(1, len(arr))`` float mask with ones over the entries of ``arr``.

    The mask is allocated as zeros and the first ``len(arr)`` positions are set
    to 1, so without padding every entry is 1.

    :param arr: any sized sequence or array; only its length is used.
    :return: ``np.ndarray`` of shape ``(1, len(arr))``.
    """
    length = len(arr)
    # The shape must be a single tuple (a second positional argument is the
    # dtype), and the only row of the mask is row 0.
    mask = np.zeros((1, length))
    mask[0, :length] = 1
    return mask
def add_index(input_array, ebd_size):
    """
    Flatten a batch of index arrays, offsetting each sample by ``ebd_size``.

    Every entry of sample ``b`` is increased by ``b * ebd_size``, so the
    flattened result can index into per-sample tables of ``ebd_size`` rows
    that have been concatenated along the first axis.

    :param input_array: array of shape ``(batch_size, n_vertex, n_nbs)``.
    :param ebd_size: number of rows each sample occupies in the flattened table.
    :return: 1-D array of length ``batch_size * n_vertex * n_nbs``.
    """
    batch_size, n_vertex, n_nbs = np.shape(input_array)
    # One offset per sample, repeated for each of that sample's entries.
    # (The previous `range(...) * int` only works for Python 2 lists.)
    sample_offsets = np.arange(batch_size) * ebd_size
    add_idx = np.repeat(sample_offsets, n_vertex * n_nbs)
    return np.reshape(input_array, -1) + add_idx
+# TODO fix padding and masking
def drug_featurizer(smiles, max_neighbors=6):
    """
    Work-in-progress MONN-style featurizer for a single molecule.

    Parses ``smiles`` with RDKit and fills per-atom / per-bond feature arrays,
    atom- and bond-neighbour index tables with ``max_neighbors`` columns, a
    neighbour mask and a vertex mask.

    Returns ``(vertex_mask, feat_atoms, feat_bonds, atom_adj, bond_adj,
    neighbor_mask)`` on the normal path, and five empty lists when filling the
    atom-neighbour table raises.

    NOTE(review): see the inline notes - as written, several statements look
    like they cannot execute successfully; confirm the intended array layout
    before relying on this function.
    """
    mol = MolFromSmiles(smiles)
    # convert molecule to GNN input
    n_atoms = mol.GetNumAtoms()
    assert mol.GetNumBonds() >= 0
    # Keep at least one bond slot so bond-less molecules still get a bond array.
    n_bonds = max(mol.GetNumBonds(), 1)
    feat_atoms = np.zeros((n_atoms,))  # atom feature ID
    feat_bonds = np.zeros((n_bonds,))  # bond feature ID
    # Row i lists the neighbouring atom indices / bond indices of atom i.
    atom_adj = np.zeros((n_atoms, max_neighbors))
    bond_adj = np.zeros((n_atoms, max_neighbors))
    # Running count of neighbours recorded so far for each atom.
    n_neighbors = np.zeros((n_atoms,))
    neighbor_mask = np.zeros((n_atoms, max_neighbors))
    for atom in mol.GetAtoms():
        idx = atom.GetIdx()
        # NOTE(review): atom_features (graph.py) returns a feature vector, but
        # feat_atoms is 1-D, so this assigns a vector to a scalar slot - confirm.
        feat_atoms[idx] = atom_features(atom)
    for bond in mol.GetBonds():
        a1 = bond.GetBeginAtom().GetIdx()
        a2 = bond.GetEndAtom().GetIdx()
        idx = bond.GetIdx()
        # NOTE(review): same shape concern as above for bond_features.
        feat_bonds[idx] = bond_features(bond)
        try:
            # NOTE(review): n_neighbors holds floats (np.zeros default dtype)
            # and is used as an index here - confirm whether an int dtype was intended.
            atom_adj[a1, n_neighbors[a1]] = a2
            atom_adj[a2, n_neighbors[a2]] = a1
        except:
            # Presumably meant for atoms with more than max_neighbors neighbours;
            # the bare except catches every error.
            # NOTE(review): returns five values while the normal return has six.
            return [], [], [], [], []
        bond_adj[a1, n_neighbors[a1]] = idx
        bond_adj[a2, n_neighbors[a2]] = idx
        n_neighbors[a1] += 1
        n_neighbors[a2] += 1
    # Mark the used neighbour slots of each atom.
    for i in range(len(n_neighbors)):
        neighbor_mask[i, :n_neighbors[i]] = 1
    vertex_mask = get_mask(feat_atoms)
    # vertex = pack_1d(feat_atoms)
    # edge = pack_1d(feat_bonds)
    # atom_adj = pack_2d(atom_adj)
    # bond_adj = pack_2d(bond_adj)
    # nbs_mask = pack_2d(n_neighbors_mat)
    # NOTE(review): add_index unpacks a 3-D (batch, vertex, neighbours) shape,
    # while atom_adj / bond_adj are 2-D here, and feat_bonds is 1-D so
    # np.shape(feat_bonds)[1] has no second axis - these calls look like
    # leftovers from the batched (packed) version above; confirm.
    atom_adj = add_index(atom_adj, np.shape(atom_adj)[1])
    bond_adj = add_index(bond_adj, np.shape(feat_bonds)[1])
    return vertex_mask, feat_atoms, feat_bonds, atom_adj, bond_adj, neighbor_mask
+# TODO WIP the pairwise_label matrix probably should be generated beforehand and stored as an extra label in the dataset
def get_pairwise_label(pdbid, interaction_dict, mol):
    """
    Build the atom-by-residue interaction label matrix for one complex.

    :param pdbid: key of the complex in ``interaction_dict``.
    :param interaction_dict: mapping from pdbid to a record with the keys
        'atom_element', 'atom_name', 'uniprot_seq', 'atom_bond_type'
        (pairs of atom name and bond type) and 'residue_bond_type'
        (pairs of sequence index and bond type).
    :param mol: molecule whose atoms (``GetAtoms()`` / ``GetSymbol()``) must
        match the record's non-hydrogen atoms, in order.
    :return: ``(True, matrix)`` where ``matrix`` is an int32 array of shape
        (number of non-H atoms, sequence length) with 1 wherever an atom and
        a residue share a bond type; ``(False, zeros((1, 1)))`` if ``pdbid``
        is unknown or no interaction is recorded.
    :raises AssertionError: if the record's non-hydrogen elements differ from
        the molecule's atoms.
    :raises ValueError: if an atom name in 'atom_bond_type' is not among the
        record's non-hydrogen atom names.
    """
    if pdbid not in interaction_dict:
        return False, np.zeros((1, 1))
    record = interaction_dict[pdbid]

    sdf_element = np.array([atom.GetSymbol().upper() for atom in mol.GetAtoms()], dtype=str)
    atom_element = np.array(record['atom_element'], dtype=str)
    atom_names = np.array(record['atom_name'], dtype=str)

    nonH_position = np.where(atom_element != 'H')[0]
    # array_equal also compares lengths; an element-wise `!=` could broadcast
    # a length-1 array and let a mismatched molecule pass.
    assert np.array_equal(atom_element[nonH_position], sdf_element)

    heavy_atom_names = atom_names[nonH_position].tolist()
    pairwise_mat = np.zeros((len(nonH_position), len(record['uniprot_seq'])), dtype=np.int32)
    for atom_name, bond_type in record['atom_bond_type']:
        atom_idx = heavy_atom_names.index(str(atom_name))
        for seq_idx, bond_type_seq in record['residue_bond_type']:
            if bond_type == bond_type_seq:
                pairwise_mat[atom_idx, seq_idx] = 1

    if pairwise_mat.any():
        return True, pairwise_mat
    return False, np.zeros((1, 1))
def protein_featurizer(fasta):
    """
    Encode a FASTA sequence with ``fasta_to_label`` and pair it with its mask.

    Returns ``(seq_mask, sequence)`` where ``seq_mask`` is ``get_mask`` applied
    to the encoded sequence.
    """
    encoded = fasta_to_label(fasta)
    # pad proteins and make masks
    return get_mask(encoded), encoded

deepscreen/data/featurizers/token.py ADDED Viewed

	@@ -0,0 +1,299 @@

+import collections
+from importlib import resources
+import os
+import re
+from typing import Optional, List
+import numpy as np
+from transformers import BertTokenizer
+SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
+# \[[^\]]+\] # match anything inside square brackets
+# |Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p # match elements
+# |\(|\) # match parentheses
+# |\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2} # match various symbols
+# |[0-9] # match digits
def sequence_to_kmers(sequence, k=3):
    """Split a string into its overlapping k-mers.

    Parameters:
        sequence (string): text to slide the window over
        k (int), default 3: window width
    Returns:
        List of the consecutive length-k substrings, in order; empty when
        the sequence is shorter than k.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers
def sequence_to_word_embedding(sequence, model):
    """
    Embed a protein sequence as one word vector per 3-mer.

    :param sequence: protein sequence string.
    :param model: word-embedding model whose ``model.wv[word]`` lookup returns
        the vector of a k-mer and raises ``KeyError`` for unknown k-mers
        (e.g. a gensim Word2Vec model).
    :return: array of shape ``(num_kmers, dim)``; rows of k-mers missing from
        the vocabulary stay all-zero. ``dim`` is ``model.wv.vector_size`` when
        that attribute exists and 100 otherwise.
    """
    kmers = sequence_to_kmers(sequence)
    # Follow the model's own embedding width instead of assuming 100.
    dim = getattr(model.wv, 'vector_size', 100)
    vec = np.zeros((len(kmers), dim))
    for row, word in enumerate(kmers):
        try:
            vec[row] = model.wv[word]
        except KeyError:
            # Out-of-vocabulary k-mer: leave its row as zeros.
            pass
    return vec
def sequence_to_token_ids(sequence, tokenizer):
    """Encode ``sequence`` with ``tokenizer.encode`` and return the ids as a NumPy array."""
    return np.array(tokenizer.encode(sequence))
+# def sequence_to_token_ids(sequence, tokenizer, max_length: int):
+#     token_ids = tokenizer.encode(sequence)
+#     length = min(max_length, len(token_ids))
+#
+#     token_ids_padded = np.zeros(max_length, dtype='int')
+#     token_ids_padded[:length] = token_ids[:length]
+#
+#     return token_ids_padded
class SmilesTokenizer(BertTokenizer):
    """
    Adapted from https://github.com/deepchem/deepchem/.

    Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
    implementation found in Huggingface's transformers library. It runs a WordPiece tokenization
    algorithm over SMILES strings using the tokenization SMILES regex developed by Schwaller et al.

    Please see https://github.com/huggingface/transformers
    and https://github.com/rxn4chemistry/rxnfp for more details.

    Examples
    --------
    >>> tokenizer = SmilesTokenizer(vocab_path, regex_pattern)
    >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
    [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]

    References
    ----------
    .. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
        Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
        Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Note
    ----
    This class requires huggingface's transformers and tokenizers libraries to be installed.
    """

    def __init__(
            self,
            vocab_file: str = 'resources/vocabs/smiles.txt',
            regex_pattern: str = SMI_REGEX_PATTERN,
            **kwargs):
        """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a SMILES character per line vocabulary file.
            Defaults to 'resources/vocabs/smiles.txt'.
        regex_pattern: str
            Regex used by the underlying BasicSmilesTokenizer to split a SMILES string.
        **kwargs
            Forwarded to ``BertTokenizer.__init__`` (e.g. special-token overrides).

        Raises
        ------
        ValueError
            If ``vocab_file`` is not an existing file.
        """
        # Validate before the parent constructor is handed the path; checking
        # afterwards could never produce this error message.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocab file at path '{}'.".format(vocab_file))
        super().__init__(vocab_file, **kwargs)

        self.vocab = load_vocab(vocab_file)
        unused_indexes = [i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")]
        self.highest_unused_index = 0 if len(unused_indexes) == 0 else max(unused_indexes)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.basic_tokenizer = BasicSmilesTokenizer(regex_pattern=regex_pattern)

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        return len(self.vocab)

    @property
    def vocab_list(self):
        """Vocabulary tokens, in file order."""
        return list(self.vocab.keys())

    def _tokenize(self, text: str, max_seq_length: int = 512, **kwargs):
        """Tokenize a string into a list of tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        max_seq_length: int
            Maximum total length; two positions are reserved for the
            [CLS] and [SEP] tokens, so at most ``max_seq_length - 2``
            tokens are returned.
        """
        max_len_single_sentence = max_seq_length - 2
        return self.basic_tokenizer.tokenize(text)[:max_len_single_sentence]

    def _convert_token_to_id(self, token: str):
        """Converts a token (str/unicode) in an id using the vocab.

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int):
        """Converts an index (integer) in a token (string/unicode) using the vocab.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part of a larger sequence.
        """
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]):
        """Converts a sequence of tokens (string) in a single string.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """
        out_string: str = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def add_special_tokens_ids_single_sequence(self,
                                               token_ids: List[Optional[int]]):
        """Adds special tokens to a sequence for sequence classification tasks.

        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        token_ids: list[int]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        """
        return [self.cls_token_id] + token_ids + [self.sep_token_id]

    def add_special_tokens_single_sequence(self, tokens: List[str]):
        """Adds special tokens to a sequence for sequence classification tasks.

        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.
        """
        return [self.cls_token] + tokens + [self.sep_token]

    def add_special_tokens_ids_sequence_pair(
            self, token_ids_0: List[Optional[int]],
            token_ids_1: List[Optional[int]]) -> List[Optional[int]]:
        """Adds special tokens to a sequence pair for sequence classification tasks.

        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

        Parameters
        ----------
        token_ids_0: List[int]
            List of ids for the first string sequence in the sequence pair (A).
        token_ids_1: List[int]
            List of ids for the second string sequence in the sequence pair (B).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def add_padding_tokens(self,
                           token_ids: List[Optional[int]],
                           length: int,
                           right: bool = True) -> List[Optional[int]]:
        """Adds padding tokens to return a sequence of length ``length``.

        By default padding tokens are added to the right of the sequence.
        A sequence already at least ``length`` long is returned unpadded.

        Parameters
        ----------
        token_ids: list[optional[int]]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        length: int
            Target length of the padded sequence.
        right: bool, default True
            Pad on the right when true, on the left otherwise.

        Returns
        -------
        List[int]
        """
        padding = [self.pad_token_id] * (length - len(token_ids))
        if right:
            return token_ids + padding
        else:
            return padding + token_ids
class BasicSmilesTokenizer(object):
    """
    Adapted from https://github.com/deepchem/deepchem/.

    Regex-only SMILES tokenizer based on the pattern developed by Schwaller et. al.
    Use it when a tokenizer without the HuggingFace transformers dependency is needed.

    Examples
    --------
    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
    ['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']

    References
    ----------
    .. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
        ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
        1572-1583 DOI: 10.1021/acscentsci.9b00576
    """

    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
        """Store and compile the tokenization regex.

        Parameters
        ----------
        regex_pattern: string
            SMILES token regex
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(regex_pattern)

    def tokenize(self, text):
        """Return every regex match in ``text``, in order, as a new list."""
        return list(self.regex.findall(text))
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered token -> line-index mapping."""
    with open(vocab_file, "r", encoding="utf-8") as handle:
        lines = handle.readlines()
    # Only the trailing newline is stripped; other whitespace stays part of the token.
    return collections.OrderedDict(
        (line.rstrip("\n"), position) for position, line in enumerate(lines)
    )

deepscreen/data/single_entity.py ADDED Viewed

	@@ -0,0 +1,195 @@

+# from itertools import product
+from numbers import Number
+from pathlib import Path
+from typing import Any, Dict, Optional, Sequence, Union, Literal
+# import numpy as np
+import pandas as pd
+from lightning import LightningDataModule
+from sklearn.base import TransformerMixin
+from torch.utils.data import Dataset, DataLoader, random_split
+from deepscreen.data.utils.dataset import SingleEntitySingleTargetDataset, BaseEntityDataset
+from deepscreen.data.utils.label import label_transform
+from deepscreen.data.utils.collator import collate_fn
+from deepscreen.data.utils.sampler import SafeBatchSampler
class EntityDataModule(LightningDataModule):
    """
    DataModule for single-entity datasets.

    A DataModule implements 5 key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html

    The loaded splits are stored in ``data_train``, ``data_val``, ``data_test``
    and ``data_predict``; each is ``None`` until it has been loaded.
    """

    def __init__(
            self,
            dataset: type[BaseEntityDataset],
            task: Literal['regression', 'binary', 'multiclass'],
            n_classes: Optional[int],
            train: bool,
            batch_size: int,
            num_workers: int = 0,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            pin_memory: bool = False,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            train_val_test_split: Optional[Union[Sequence[Number], Sequence[str]]] = None,
            split: Optional[callable] = random_split,
            transformer: Optional[TransformerMixin] = None,
            featurizer: Optional[callable] = None,
    ):
        """
        :param dataset: dataset class used to load pre-split or test/predict files;
            it is called as ``dataset(dataset_path=...)``.
        :param task: task type, forwarded to the dataset built in ``setup``.
        :param n_classes: number of classes, forwarded to the dataset built in ``setup``.
        :param train: ``True`` for fitting, ``False`` for testing/predicting.
        :param batch_size: batch size of every dataloader.
        :param num_workers: dataloader worker count.
        :param thresholds: forwarded to the dataset built in ``setup``.
        :param pin_memory: dataloader ``pin_memory`` flag.
        :param data_dir: directory containing the data files.
        :param data_file: single data file inside ``data_dir``.
        :param train_val_test_split: either 3 numbers (split sizes used with
            ``data_file`` and ``split``) or 3 file names (pre-split data, which
            requires ``data_file=None`` and ``split=None``).
        :param split: callable invoked as ``split(dataset=..., lengths=...)``.
        :param transformer: forwarded to the dataset built in ``setup``.
        :param featurizer: forwarded to the dataset built in ``setup``.
        :raises ValueError: if the combination of arguments matches none of the
            supported modes.
        """
        super().__init__()

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

        # data processing
        self.split = split
        # Always define the split attributes so setup() and the dataloaders can
        # test them safely.
        self.data_train = None
        self.data_val = None
        self.data_test = None
        self.data_predict = None

        has_three_parts = train_val_test_split is not None and len(train_val_test_split) == 3

        if train:
            if data_file and split:
                if not (has_three_parts
                        and all(isinstance(part, Number) for part in train_val_test_split)):
                    raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
                                     '(float for percentages and int for sample numbers) if '
                                     '`data_file` and `split` have been specified.')
                # The single file is loaded and split lazily in setup().
            elif (has_three_parts
                  and all(isinstance(part, str) for part in train_val_test_split)
                  and not data_file and not split):
                train_file, val_file, test_file = train_val_test_split
                self.data_train = dataset(dataset_path=str(Path(data_dir) / train_file))
                self.data_val = dataset(dataset_path=str(Path(data_dir) / val_file))
                self.data_test = dataset(dataset_path=str(Path(data_dir) / test_file))
            else:
                raise ValueError('For training (train=True), you must specify either '
                                 '`data_file` and `split` with `train_val_test_split` of 3 numbers or '
                                 'solely `train_val_test_split` of 3 data file names.')
        else:
            if data_file and not split and not train_val_test_split:
                self.data_test = self.data_predict = dataset(dataset_path=str(Path(data_dir) / data_file))
            else:
                raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
                                 "`train_val_test_split` or `split`")

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.

        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice. ``encoding`` is currently unused.
        """
        # load and split datasets only if not loaded in initialization
        already_loaded = any(
            part is not None
            for part in (self.data_train, self.data_val, self.data_test, self.data_predict)
        )
        if already_loaded:
            return

        # NOTE(review): this path always builds a SingleEntitySingleTargetDataset
        # rather than `hparams.dataset`; confirm that is intended and that the
        # class accepts transformer/featurizer set to None.
        dataset = SingleEntitySingleTargetDataset(
            task=self.hparams.task,
            n_classes=self.hparams.n_classes,
            dataset_path=Path(self.hparams.data_dir) / self.hparams.data_file,
            transformer=self.hparams.transformer,
            featurizer=self.hparams.featurizer,
            thresholds=self.hparams.thresholds,
        )

        if self.hparams.train:
            self.data_train, self.data_val, self.data_test = self.split(
                dataset=dataset,
                lengths=self.hparams.train_val_test_split
            )
        else:
            self.data_test = self.data_predict = dataset

    def _build_dataloader(self, data, shuffle: bool):
        """Create the DataLoader shared by all stages; only data and shuffling differ."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                shuffle=shuffle),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return self._build_dataloader(self.data_train, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader."""
        return self._build_dataloader(self.data_val, shuffle=False)

    def test_dataloader(self):
        """Return the test dataloader."""
        return self._build_dataloader(self.data_test, shuffle=False)

    def predict_dataloader(self):
        """Return the prediction dataloader."""
        return self._build_dataloader(self.data_predict, shuffle=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass

deepscreen/data/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from typing import Dict, Sequence, TypeVar, Union
+from deepscreen.data.utils.collator import collate_fn
+from deepscreen.data.utils.label import label_transform
+from deepscreen.data.utils.sampler import SafeBatchSampler
+T = TypeVar('T')
+FlexibleIterable = Union[T, Sequence[T], Dict[str, T]]

deepscreen/data/utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (701 Bytes). View file

deepscreen/data/utils/__pycache__/collator.cpython-311.pyc ADDED Viewed

Binary file (4.97 kB). View file

deepscreen/data/utils/__pycache__/label.cpython-311.pyc ADDED Viewed

Binary file (4.88 kB). View file

deepscreen/data/utils/__pycache__/sampler.cpython-311.pyc ADDED Viewed

Binary file (3.56 kB). View file

deepscreen/data/utils/__pycache__/split.cpython-311.pyc ADDED Viewed

Binary file (5.68 kB). View file

deepscreen/data/utils/collator.py ADDED Viewed

	@@ -0,0 +1,168 @@

+"""
+Define collate functions for new data types here
+"""
+from functools import partial
+from itertools import chain
+import dgl
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data._utils.collate import default_collate_fn_map, collate_tensor_fn, collate
+import torch_geometric
def collate_pyg_fn(batch, collate_fn_map=None):
    """
    Merge a list of PyG data objects into a single PyG ``Batch``.

    ``collate_fn_map`` is accepted so the function fits the collate-registry
    call signature; it is not used.
    """
    merged = torch_geometric.data.Batch.from_data_list(batch)
    return merged
def collate_dgl_fn(batch, collate_fn_map=None):
    """
    Merge a list of DGL graphs into one batched graph with ``dgl.batch``.

    ``collate_fn_map`` is accepted so the function fits the collate-registry
    call signature; it is not used.
    """
    batched_graph = dgl.batch(batch)
    return batched_graph
def pad_collate_tensor_fn(batch, padding_value=0.0, collate_fn_map=None):
    """
    Collate tensors of possibly different sizes into one padded batch tensor.

    Similar to pad_packed_sequence(pack_sequence(batch, enforce_sorted=False), batch_first=True),
    but additionally supports tensors that differ in any dimension, e.g. a list of
    square Tensors of size ``(L x L x ...)``.

    * identical shapes: the tensors are stacked as-is;
    * only the first dimension differs: padded with ``pad_sequence``;
    * other dimensions differ: every dimension is padded at its end up to the
      batch maximum, then the tensors are stacked.

    :param batch: list of tensors with the same number of dimensions (at least 1).
    :param padding_value: value written into the padded positions.
    :param collate_fn_map: accepted for the collate-registry signature; unused.
    :return: padded_batch, lengths - ``lengths`` holds each tensor's original
        size along its first dimension.
    """
    lengths = torch.as_tensor([tensor.size(0) for tensor in batch])

    if len({tuple(tensor.shape) for tensor in batch}) <= 1:
        # Nothing to pad.
        return collate_tensor_fn(batch), lengths

    if len({tuple(tensor.shape[1:]) for tensor in batch}) <= 1:
        # Only the leading (sequence) dimension is ragged.
        padded = pad_sequence(batch, batch_first=True, padding_value=padding_value)
        return padded, lengths

    # Trailing dimensions differ. Decide this from the shapes rather than from
    # pad_sequence failing, which could instead broadcast size-1 dimensions.
    max_sizes = [max(tensor.size(dim) for tensor in batch) for dim in range(batch[0].dim())]
    padded_tensors = []
    for tensor in batch:
        # F.pad expects (left, right) pairs starting from the LAST dimension.
        pad_spec = []
        for dim in reversed(range(tensor.dim())):
            pad_spec.extend((0, max_sizes[dim] - tensor.size(dim)))
        padded_tensors.append(
            torch.nn.functional.pad(tensor, tuple(pad_spec), mode='constant', value=padding_value)
        )
    return collate_tensor_fn(padded_tensors), lengths
+# Join custom collate functions with the default collation map of PyTorch
+COLLATE_FN_MAP = default_collate_fn_map | {
+    torch_geometric.data.data.BaseData: collate_pyg_fn,
+    dgl.DGLGraph: collate_dgl_fn,
+}
def collate_fn(batch, automatic_padding=False, padding_value=0):
    """
    Collate a batch with the project's collation map (PyTorch defaults plus the
    PyG and DGL graph handlers).

    :param batch: list of samples to collate.
    :param automatic_padding: when true, tensors are collated with
        ``pad_collate_tensor_fn`` (padded and returned with their lengths)
        instead of the default tensor collation.
    :param padding_value: padding value used when ``automatic_padding`` is true.
    :return: the collated batch.
    """
    fn_map = COLLATE_FN_MAP
    if automatic_padding:
        # Build a per-call map. Updating the shared module-level map would keep
        # padding enabled for every later call, even with automatic_padding=False.
        fn_map = COLLATE_FN_MAP | {
            torch.Tensor: partial(pad_collate_tensor_fn, padding_value=padding_value),
        }
    return collate(batch, collate_fn_map=fn_map)
+# class VariableLengthSequence(torch.Tensor):
+#     """
+#     A custom PyTorch Tensor class that is similar to PackedSequence, except it can be directly used as a batch tensor,
+#     and it has an attribute called lengths, which signifies the length of each original sequence in the batch.
+#     """
+#
+#     def __new__(cls, data, lengths):
+#         """
+#         Creates a new VariableLengthSequence object from the given data and lengths.
+#         Args:
+#             data (torch.Tensor): The batch collated tensor of shape (batch_size, max_length, *).
+#             lengths (torch.Tensor): The lengths of each original sequence in the batch of shape (batch_size,).
+#         Returns:
+#             VariableLengthSequence: A new VariableLengthSequence object.
+#         """
+#         # Check the validity of the inputs
+#         assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
+#         assert isinstance(lengths, torch.Tensor), "lengths must be a torch.Tensor"
+#         assert data.dim() >= 2, "data must have at least two dimensions"
+#         assert lengths.dim() == 1, "lengths must have one dimension"
+#         assert data.size(0) == lengths.size(0), "data and lengths must have the same batch size"
+#         assert lengths.min() > 0, "lengths must be positive"
+#         assert lengths.max() <= data.size(1), "lengths must not exceed the max length of data"
+#
+#         # Create a new tensor object from data
+#         obj = super().__new__(cls, data)
+#
+#         # Set the lengths attribute
+#         obj.lengths = lengths
+#
+#         return obj
+# class VariableLengthSequence(torch.Tensor):
+#     _lengths = torch.Tensor()
+#
+#     def __new__(cls, data, lengths, *args, **kwargs):
+#         self = super().__new__(cls, data, *args, **kwargs)
+#         self.lengths = lengths
+#         return self
+#
+#     def clone(self, *args, **kwargs):
+#         return VariableLengthSequence(super().clone(*args, **kwargs), self.lengths.clone())
+#
+#     def new_empty(self, *size):
+#         return VariableLengthSequence(super().new_empty(*size), self.lengths)
+#
+#     def to(self, *args, **kwargs):
+#         return VariableLengthSequence(super().to(*args, **kwargs), self.lengths.to(*args, **kwargs))
+#
+#     def __format__(self, format_spec):
+#         # Convert self to a string or a number here, depending on what you need
+#         return self.item().__format__(format_spec)
+#
+#     @property
+#     def lengths(self):
+#         return self._lengths
+#
+#     @lengths.setter
+#     def lengths(self, lengths):
+#         self._lengths = lengths
+#
+#     def cpu(self, *args, **kwargs):
+#         return VariableLengthSequence(super().cpu(*args, **kwargs), self.lengths.cpu(*args, **kwargs))
+#
+#     def cuda(self, *args, **kwargs):
+#         return VariableLengthSequence(super().cuda(*args, **kwargs), self.lengths.cuda(*args, **kwargs))
+#
+#     def pin_memory(self):
+#         return VariableLengthSequence(super().pin_memory(), self.lengths.pin_memory())
+#
+#     def share_memory_(self):
+#         super().share_memory_()
+#         self.lengths.share_memory_()
+#         return self
+#
+#     def detach_(self, *args, **kwargs):
+#         super().detach_(*args, **kwargs)
+#         self.lengths.detach_(*args, **kwargs)
+#         return self
+#
+#     def detach(self, *args, **kwargs):
+#         return VariableLengthSequence(super().detach(*args, **kwargs), self.lengths.detach(*args, **kwargs))
+#
+#     def record_stream(self, *args, **kwargs):
+#         super().record_stream(*args, **kwargs)
+#         self.lengths.record_stream(*args, **kwargs)
+#         return self
+    # @classmethod
+    # def __torch_function__(cls, func, types, args=(), kwargs=None):
+    #     return super().__torch_function__(func, types, args, kwargs) \
+    #         if cls.lengths is not None else torch.Tensor.__torch_function__(func, types, args, kwargs)

deepscreen/data/utils/dataset.py ADDED Viewed

	@@ -0,0 +1,216 @@

+from numbers import Number
+from typing import Literal, Union, Sequence
+import pandas as pd
+from sklearn.base import TransformerMixin
+from sklearn.exceptions import NotFittedError
+from sklearn.utils.validation import check_is_fitted
+from torch.utils.data import Dataset
+from deepscreen.data.utils import label_transform, FlexibleIterable
class BaseEntityDataset(Dataset):
    """Base dataset backed by a CSV table whose columns are selected by name prefix.

    Only columns whose names start with one of `use_col_prefixes` are loaded. Columns starting with `Y` are read
    as `float32`; every other selected column is read with the pandas `string` dtype. The selected columns are
    grouped by prefix into `label_cols` (`Y`), `label_unit_cols` (`U`), `entity_id_cols` (`ID`) and
    `entity_cols` (`X`).

    Subclasses must implement `__getitem__`.

    Args:
        dataset_path: Path of the CSV file; its first row is used as the header.
        use_col_prefixes: A prefix or an iterable of prefixes of the column names to load.
    """

    def __init__(
            self,
            dataset_path: str,
            use_col_prefixes=('X', 'Y', 'ID', 'U')
    ):
        # `str.startswith` only accepts a `str` or a `tuple` of `str`, so normalise lists and other iterables.
        if not isinstance(use_col_prefixes, str):
            use_col_prefixes = tuple(use_col_prefixes)

        # Read the data table header row first to filter columns and create column dtype dict
        header = pd.read_csv(
            dataset_path,
            header=0, nrows=0,
            usecols=lambda col: col.startswith(use_col_prefixes)
        )
        # Read the whole data table
        df = pd.read_csv(
            dataset_path,
            header=0,
            usecols=header.columns,
            dtype={col: 'float32' if col.startswith('Y') else 'string' for col in header.columns}
        )

        self.df = df
        self.label_cols = [col for col in df.columns if col.startswith('Y')]
        self.label_unit_cols = [col for col in df.columns if col.startswith('U')]
        self.entity_id_cols = [col for col in df.columns if col.startswith('ID')]
        self.entity_cols = [col for col in df.columns if col.startswith('X')]

    def __len__(self):
        """Return the number of rows of the loaded table."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return the sample at `idx`; must be implemented by subclasses."""
        raise NotImplementedError
+# TODO test transform
+class SingleEntitySingleTargetDataset(BaseEntityDataset):
+    def __init__(
+            self,
+            dataset_path: str,
+            task: Literal['regression', 'binary', 'multiclass'],
+            n_classes: int,
+            featurizer: callable,
+            transformer: TransformerMixin = None,
+            thresholds: Union[Number, Sequence[Number]] = None,
+            discard_intermediate: bool = None,
+            forward_fill: bool = True
+    ):
+        super().__init__(dataset_path)
+        assert len(self.entity_cols) == 1, 'The dataset contains more than 1 entity column (starting with `X`).'
+        if len(self.label_cols) >= 0:
+            assert len(self.label_cols) == 1, 'The dataset contains more than 1 label column (starting with `Y`).'
+        # Remove trailing `1`s in column names for flexibility
+        self.df.columns = self.df.columns.str.rstrip('1')
+        # Forward-fill non-label columns
+        nonlabel_cols = self.label_unit_cols + self.entity_id_cols + self.entity_cols
+        if forward_fill:
+            self.df[nonlabel_cols] = self.df[nonlabel_cols].ffill(axis=0)
+        # Process target labels for training/testing if exist
+        if self.label_cols:
+            # Transform target labels
+            self.df[self.label_cols] = self.df[self.label_cols].apply(
+                label_transform,
+                units=self.df.get('U', None),
+                thresholds=thresholds,
+                discard_intermediate=discard_intermediate).astype('float32')
+            # Filter out rows with a NaN in Y (missing values); use inplace to save memory
+            self.df.dropna(subset=self.label_cols, inplace=True)
+            # Validate target labels
+            # TODO: check sklearn.utils.multiclass.check_classification_targets
+            match task:
+                case 'regression':
+                    assert all(self.df['Y'].apply(lambda x: isinstance(x, Number))), \
+                        f"Y for task `regression` must be numeric; got {set(self.df['Y'].apply(type))}."
+                case 'binary':
+                    assert all(self.df['Y'].isin([0, 1])), \
+                        f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(self.df['Y'])}." \
+                        "\nYou may set `thresholds` to discretize continuous labels."
+                case 'multiclass':
+                    assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
+                    assert all(self.df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
+                        f"``Y` for task `multiclass` (classification) must be non-negative integers, " \
+                        f"but `Y` got {pd.unique(self.df['Y'])}." \
+                        "\nYou may set `thresholds` to discretize continuous labels."
+                    target_n_unique = self.df['Y'].nunique()
+                    assert target_n_unique == n_classes, \
+                        f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
+                        f"but `Y` has {target_n_unique} unique labels."
+        if transformer:
+            self.df['X'] = self.df['X'].apply(featurizer)
+            try:
+                check_is_fitted(transformer)
+                self.df['X'] = list(transformer.transform(self.df['X']))
+            except NotFittedError:
+                self.df['X'] = list(transformer.fit_transform(self.df['X']))
+            # Skip sample-wise feature extraction because it has already been done dataset-wise
+            self.featurizer = lambda x: x
+        self.featurizer = featurizer
+        self.n_classes = n_classes
+        self.df['ID'] = self.df.get('ID', self.df['X'])
+    def __getitem__(self, idx):
+        sample = self.df.loc[idx]
+        return {
+            'X': self.featurizer(sample['X']),
+            'ID': sample['ID'],
+            'Y': sample.get('Y')
+        }
+# TODO WIP
+class MultiEntityMultiTargetDataset(BaseEntityDataset):
+    def __init__(
+            self,
+            dataset_path: str,
+            task: FlexibleIterable[Literal['regression', 'binary', 'multiclass']],
+            n_class: FlexibleIterable[int],
+            featurizers: FlexibleIterable[callable],
+            thresholds: FlexibleIterable[Union[Number, Sequence[Number]]] = None,
+            discard_intermediate: FlexibleIterable[bool] = None,
+    ):
+        super().__init__(dataset_path)
+        label_col_prefix = tuple('Y')
+        nonlabel_col_prefixes = tuple(('X', 'ID', 'U'))
+        allowed_col_prefixes = label_col_prefix + nonlabel_col_prefixes
+        # Read the headers first to filter columns and create column dtype dict
+        df = pd.read_csv(
+            dataset_path,
+            header=0, nrows=0,
+            usecols=lambda col: col.startswith(allowed_col_prefixes)
+        )
+        # Read the whole table
+        df = pd.read_csv(
+            dataset_path,
+            header=0,
+            usecols=df.columns,
+            dtype={col: 'float32' if col.startswith('Y') else 'string' for col in df.columns}
+        )
+        label_cols = [col for col in df.columns if col.startswith(label_col_prefix)]
+        nonlabel_cols = [col for col in df.columns if col.startswith(nonlabel_col_prefixes)]
+        self.entity_cols = [col for col in nonlabel_cols if col.startswith('X')]
+        # Forward-fill all non-label columns
+        df[nonlabel_cols] = df[nonlabel_cols].ffill(axis=0)
+        # Process target labels for training/testing
+        if label_cols:
+            # Transform target labels
+            df[label_cols] = df[label_cols].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
+                                                  discard_intermediate=discard_intermediate).astype('float32')
+            # Filter out rows with a NaN in Y (missing values)
+            df.dropna(subset=label_cols, inplace=True)
+            # Validate target labels
+            # TODO: check sklearn.utils.multiclass.check_classification_targets
+            # WIP
+            match task:
+                case 'regression':
+                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
+                        f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
+                case 'binary':
+                    assert all(df['Y'].isin([0, 1])), \
+                        f"Y for task `binary` must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
+                        "\nYou may set `thresholds` to discretize continuous labels."
+                case 'multiclass':
+                    assert len(label_cols) == len(n_class), \
+                        (f'Data table has {len(label_cols)} label columns (`Y*`) but you have specified '
+                         f'n_class of length {len(n_class)} for task `multiclass`.')
+                    for label, n in zip(df[label_cols], n_class):
+                        assert n >= 3, f'n_class for task `multiclass` must be at least 3.'
+                        assert all(label.apply(lambda x: x.is_integer() and x >= 0)), \
+                            f"Y for task `multiclass` must be non-negative integers, " \
+                            f"but Y got {pd.unique(label)}." \
+                            "\nYou may set `thresholds` to discretize continuous labels."
+                        target_n_unique = label.nunique()
+                        assert target_n_unique == n, \
+                            f"You have set n_classes for task `multiclass` task to {n}, " \
+                            f"but Y has {target_n_unique} unique labels."
+        self.df = df
+        self.featurizers = featurizers
+        self.n_class = n_class
+    def __len__(self):
+        return len(self.df.index)
+    # WIP
+    def __getitem__(self, idx):
+        sample = self.df.loc[idx]
+        return {
+            'X': [featurizer(x) for featurizer, x in zip(self.featurizers, sample[self.entity_cols])],
+            'ID': sample.get('ID', sample['X']),
+            'Y': sample.get('Y')
+        }

deepscreen/data/utils/label.py ADDED Viewed

	@@ -0,0 +1,93 @@

+from numbers import Number
+from typing import Optional, Union
+import numpy as np
+from deepscreen.utils import get_logger
+log = get_logger(__name__)
# Converters from a value expressed in the keyed unit to the p scale (-log10 of the molar value).
# `p` marks values that are already on the p scale.
MOLARITY_TO_POTENCY = {
    'p': lambda x: x,
    'M': lambda x: -np.log10(x),
    'mM': lambda x: -np.log10(x) + 3,
    'μM': lambda x: -np.log10(x) + 6,
    'uM': lambda x: -np.log10(x) + 6,  # ASCII spelling of micromolar
    'nM': lambda x: -np.log10(x) + 9,
    'pM': lambda x: -np.log10(x) + 12,
    'fM': lambda x: -np.log10(x) + 15,
}


# TODO rewrite for swifter.apply
def molar_to_p(labels, units):
    """Convert labels given in molar units to the p scale (-log10[M]).

    :param labels: label values, one per sample
    :type labels: array_like
    :param units: either a single unit applied to every label, or one unit per label; every unit must be a key
        of `MOLARITY_TO_POTENCY`
    :type units: str or array_like
    :return: a float numpy array of the converted labels, with the same shape as `labels`
    :raises AssertionError: if any unit is not a key of `MOLARITY_TO_POTENCY`
    :raises ValueError: if `units` is a sequence whose shape differs from that of `labels`
    """
    values = np.asarray(labels, dtype=float)
    if isinstance(units, str):
        # Broadcast a single unit to every label
        unit_per_label = np.full(values.shape, units, dtype=object)
    else:
        unit_per_label = np.asarray(units, dtype=object)
    if unit_per_label.shape != values.shape:
        raise ValueError(
            f"labels and units must have the same shape, got {values.shape} and {unit_per_label.shape}."
        )

    present_units = set(unit_per_label.ravel().tolist())
    assert all(unit in MOLARITY_TO_POTENCY for unit in present_units), \
        f"Allowed units: {', '.join(MOLARITY_TO_POTENCY)}."

    # Convert all labels sharing a unit in one vectorized call
    converted = np.empty(values.shape, dtype=float)
    for unit in present_units:
        mask = unit_per_label == unit
        converted[mask] = MOLARITY_TO_POTENCY[unit](values[mask])
    return converted
def label_discretize(labels, thresholds):
    """Map continuous labels to discrete levels, in descending order of the label value.

    The level of a label is the number of thresholds strictly greater than it: labels at or above the greatest
    threshold get level 0 and labels below the lowest threshold get the highest level. With a single threshold
    this maps labels below it to 1 and all others to 0. A one-element sequence behaves like the bare number.

    :param labels: continuous label values
    :type labels: array_like
    :param thresholds: a single threshold or a sequence of thresholds, in any order
    :type thresholds: Number or array_like
    :return: a numpy array of levels; the dtype is integer, unless `labels` contains NaN, in which case the
        result is a float array that keeps NaN at those positions (missing labels are not assigned a level)
    """
    values = np.asarray(labels, dtype=float)
    if isinstance(thresholds, Number):
        cutoffs = np.array([thresholds], dtype=float)
    else:
        cutoffs = np.sort(np.asarray(thresholds, dtype=float), axis=None)

    # `searchsorted(..., side='right')` counts the thresholds <= label; the rest are strictly greater
    levels = cutoffs.size - np.searchsorted(cutoffs, values, side='right')

    # NaN sorts after every threshold and would silently end up in level 0; keep it missing instead
    missing = np.isnan(values)
    if missing.any():
        levels = np.where(missing, np.nan, levels)
    return levels
def label_transform(
        labels,
        units: Optional[list[str]],
        thresholds: Optional[Union[float, list[Number]]],
        discard_intermediate: Optional[bool]
):
    """Convert labels of all units to p scale (-log10[M]) and discretize them if specified.

    :param labels: a sequence of labels, continuous or binary values
    :type labels: array_like
    :param units: a sequence of label units, each a key of `MOLARITY_TO_POTENCY`; `None` or an empty sequence
        skips the unit conversion
    :type units: array_like, optional
    :param thresholds: discretization threshold(s) for affinity labels, in p scale (-log10[M]).
        A single number maps affinities below it to 1 and otherwise to 0.
        A tuple of two or more thresholds maps affinities to multiple discrete levels descendingly, assigning
        values below the lowest threshold to the highest level (e.g. 2) and values above the greatest
        threshold to 0. `None` or an empty sequence skips the discretization.
    :type thresholds: list, float, optional
    :param discard_intermediate: whether to discard the intermediate (indeterminate) level if provided an odd
        number of thresholds (>=3)
    :type discard_intermediate: bool
    :return: a numpy array of affinity labels in p scale (-log10[M]) or discrete labels; `labels` itself is
        returned unchanged when neither units nor thresholds are provided
    :raises AssertionError: if `discard_intermediate` is set without an odd number of (at least 3) thresholds
    """
    if _is_provided(units):
        labels = molar_to_p(labels, units)

    if _is_provided(thresholds):
        labels = label_discretize(labels, thresholds)
        if discard_intermediate:
            # A single numeric threshold has no length; count it as one so the assertion reports it properly
            n_thresholds = 1 if isinstance(thresholds, Number) else len(thresholds)
            # NOTE(review): an odd number of thresholds yields an even number of levels, for which
            # `n_thresholds // 2` is not a unique middle level; the documented contract is kept as is.
            # Confirm whether an even number of thresholds was intended.
            assert n_thresholds % 2 == 1 and n_thresholds >= 3, \
                "Must give an odd number of (at least 3) thresholds to discard the intermediate level."
            intermediate_level = n_thresholds // 2
            # Discrete levels may be integers, which cannot hold NaN; work on a float copy
            labels = np.array(labels, dtype=float)
            # Make the intermediate-level labels NaN (which will be filtered out later)
            labels[labels == intermediate_level] = np.nan
            # Reduce all levels above the intermediate level by 1
            labels[labels > intermediate_level] -= 1

    return labels


def _is_provided(value) -> bool:
    """Tell whether an optional argument was given, without relying on array truthiness.

    `None` and empty sequences count as not provided; scalars (including 0) count as provided.
    """
    if value is None:
        return False
    try:
        return len(value) > 0
    except TypeError:
        # Scalars have no length
        return True

deepscreen/data/utils/sampler.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from typing import Mapping, Iterable
+from torch.utils.data import BatchSampler, RandomSampler, SequentialSampler
class SafeBatchSampler(BatchSampler):
    """
    A safe `batch_sampler` that skips samples with `None` values, supports shuffling, and keeps a fixed batch size.

    Every candidate sample is fetched from `data_source` to check it, so the dataset is indexed once by this
    sampler in addition to the later access by the data loader. The inherited `__len__` does not account for
    skipped samples and may therefore overestimate the number of batches.

    Args:
        data_source (Dataset): The dataset to sample from.
        batch_size (int): The size of each batch.
        drop_last (bool): Whether to drop the last batch if its size is smaller than `batch_size`. Defaults to `False`.
        shuffle (bool, optional): Whether to shuffle the data before sampling. Defaults to `True`.
            Ignored when `sampler` is given.
        sampler (optional): An iterable of sample indices to use instead of the default random/sequential sampler.

    Raises:
        ValueError: If `batch_size` is not a positive integer or `drop_last` is not a boolean.

    Example:
        >>> dataloader = DataLoader(dataset, batch_sampler=SafeBatchSampler(dataset, batch_size, drop_last, shuffle))
    """

    def __init__(self, data_source, batch_size: int, drop_last: bool = False, shuffle: bool = True, sampler=None):
        if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
                batch_size <= 0:
            raise ValueError(f"batch_size should be a positive integer value, but got batch_size={batch_size}")
        if not isinstance(drop_last, bool):
            raise ValueError(f"drop_last should be a boolean value, but got drop_last={drop_last}")

        # Compare against `None` rather than testing truthiness: a sampler that defines `__len__` is falsy
        # when it is empty and would otherwise be silently replaced by a default sampler.
        if sampler is None:
            if shuffle:
                sampler = RandomSampler(data_source)  # type: ignore[arg-type]
            else:
                sampler = SequentialSampler(data_source)  # type: ignore[arg-type]

        super().__init__(sampler, batch_size, drop_last)
        self.data_source = data_source

    @staticmethod
    def _is_complete(sample) -> bool:
        """Return whether none of the top-level fields of `sample` is `None`."""
        if isinstance(sample, Mapping):
            fields = sample.values()
        elif isinstance(sample, Iterable) and not isinstance(sample, str):
            fields = sample
        else:
            # Scalars and strings are treated as a single field
            fields = [sample]
        return all(field is not None for field in fields)

    def __iter__(self):
        """Yield lists of `batch_size` indices of samples without `None` fields.

        The last, shorter batch is yielded only if it is non-empty and `drop_last` is false.
        """
        batch = []
        for idx in self.sampler:
            if not self._is_complete(self.data_source[idx]):
                continue
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                batch = []

        if batch and not self.drop_last:
            yield batch