Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Upload 110 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- deepscreen/__init__.py +101 -0
- deepscreen/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/__pycache__/predict.cpython-311.pyc +0 -0
- deepscreen/data/__init__.py +0 -0
- deepscreen/data/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/__pycache__/dti.cpython-311.pyc +0 -0
- deepscreen/data/dti.py +422 -0
- deepscreen/data/dti.py.bak +369 -0
- deepscreen/data/dti_datamodule.py +314 -0
- deepscreen/data/entity_datamodule.py +167 -0
- deepscreen/data/featurizers/__init__.py +0 -0
- deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/categorical.py +86 -0
- deepscreen/data/featurizers/chem.py +48 -0
- deepscreen/data/featurizers/fcs.py +67 -0
- deepscreen/data/featurizers/fingerprint/__init__.py +45 -0
- deepscreen/data/featurizers/fingerprint/atompairs.py +18 -0
- deepscreen/data/featurizers/fingerprint/avalonfp.py +16 -0
- deepscreen/data/featurizers/fingerprint/estatefp.py +12 -0
- deepscreen/data/featurizers/fingerprint/maccskeys.py +25 -0
- deepscreen/data/featurizers/fingerprint/maccskeys.xlsx +0 -0
- deepscreen/data/featurizers/fingerprint/map4.py +130 -0
- deepscreen/data/featurizers/fingerprint/mhfp6.py +18 -0
- deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef +53 -0
- deepscreen/data/featurizers/fingerprint/morganfp.py +18 -0
- deepscreen/data/featurizers/fingerprint/pharmErGfp.py +60 -0
- deepscreen/data/featurizers/fingerprint/pharmPointfp.py +59 -0
- deepscreen/data/featurizers/fingerprint/pubchemfp.py +1731 -0
- deepscreen/data/featurizers/fingerprint/pubchemfp.xlsx +0 -0
- deepscreen/data/featurizers/fingerprint/rdkitfp.py +42 -0
- deepscreen/data/featurizers/fingerprint/smarts_maccskey.py +178 -0
- deepscreen/data/featurizers/fingerprint/smarts_pharmacophore.py +21 -0
- deepscreen/data/featurizers/fingerprint/smarts_pubchem.py +734 -0
- deepscreen/data/featurizers/fingerprint/torsions.py +18 -0
- deepscreen/data/featurizers/graph.py +133 -0
- deepscreen/data/featurizers/monn.py +106 -0
- deepscreen/data/featurizers/token.py +299 -0
- deepscreen/data/single_entity.py +195 -0
- deepscreen/data/utils/__init__.py +8 -0
- deepscreen/data/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/utils/__pycache__/collator.cpython-311.pyc +0 -0
- deepscreen/data/utils/__pycache__/label.cpython-311.pyc +0 -0
- deepscreen/data/utils/__pycache__/sampler.cpython-311.pyc +0 -0
- deepscreen/data/utils/__pycache__/split.cpython-311.pyc +0 -0
- deepscreen/data/utils/collator.py +168 -0
- deepscreen/data/utils/dataset.py +216 -0
- deepscreen/data/utils/label.py +93 -0
- deepscreen/data/utils/sampler.py +90 -0
deepscreen/__init__.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
DeepScreen package initialization, registering custom objects and monkey patching for some libraries.
|
3 |
+
"""
|
4 |
+
import sys
|
5 |
+
from builtins import eval
|
6 |
+
|
7 |
+
import lightning.fabric.strategies.launchers.subprocess_script as subprocess_script
|
8 |
+
import torch
|
9 |
+
from omegaconf import OmegaConf
|
10 |
+
|
11 |
+
from deepscreen.utils import get_logger
|
12 |
+
|
13 |
+
log = get_logger(__name__)
|
14 |
+
|
15 |
+
# Allow basic Python operations in hydra interpolation; examples:
# `in_channels: ${eval:${model.drug_encoder.out_channels}+${model.protein_encoder.out_channels}}`
# `subdir: ${eval:${hydra.job.override_dirname}.replace('/', '.')}`
# NOTE(review): this registers the builtin `eval` itself, so the argument of any `${eval:...}` interpolation
# is executed as arbitrary Python. Only resolve configs and overrides that come from trusted sources.
OmegaConf.register_new_resolver("eval", eval)
|
19 |
+
|
20 |
+
|
21 |
+
def sanitize_path(path_str: str):
    """
    Make a string usable as a path component.

    Forward slashes and backslashes are replaced by ".", colons by "-", and the result is truncated to
    at most 255 characters (OS limitation).
    """
    unsafe_to_safe = str.maketrans({"/": ".", "\\": ".", ":": "-"})
    cleaned = path_str.translate(unsafe_to_safe)
    return cleaned[:255]
|
26 |
+
|
27 |
+
|
28 |
+
# Make `sanitize_path` callable from configs through the `sanitize_path` resolver name.
OmegaConf.register_new_resolver("sanitize_path", sanitize_path)
|
29 |
+
|
30 |
+
|
31 |
+
def _hydra_subprocess_cmd(local_rank: int):
    """
    Monkey patching for lightning.fabric.strategies.launchers.subprocess_script._hydra_subprocess_cmd
    Temporarily fixes the problem of unnecessarily creating log folders for DDP subprocesses in Hydra multirun/sweep.

    Args:
        local_rank: Rank of the subprocess being launched; it is embedded in the Hydra job name.

    Returns:
        A `(command, cwd)` tuple: the argument list used to re-launch the current program with extra Hydra
        overrides appended, and the working directory returned by `hydra.utils.get_original_cwd()`.
    """
    import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
    from hydra.core.hydra_config import HydraConfig
    from hydra.utils import get_original_cwd, to_absolute_path

    # when user is using hydra find the absolute path
    if __main__.__spec__ is None:  # pragma: no-cover
        command = [sys.executable, to_absolute_path(sys.argv[0])]
    else:
        command = [sys.executable, "-m", __main__.__spec__.name]

    command += sys.argv[1:]

    cwd = get_original_cwd()
    rundir = f'"{HydraConfig.get().runtime.output_dir}"'
    # Set output_subdir null since we don't want different subprocesses trying to write to config.yaml
    # Each override must be its own list element: the comma used to sit inside the string literal, so
    # implicit string concatenation fused the last two overrides into a single command-line argument.
    command += [
        f"hydra.job.name=train_ddp_process_{local_rank}",
        "hydra.output_subdir=null",
        f"hydra.runtime.output_dir={rundir}",
    ]
    return command, cwd
|
55 |
+
|
56 |
+
|
57 |
+
# Swap Lightning's `_hydra_subprocess_cmd` for the replacement function defined in this module.
subprocess_script._hydra_subprocess_cmd = _hydra_subprocess_cmd
|
58 |
+
|
59 |
+
# from torch import Tensor
|
60 |
+
# from lightning.fabric.utilities.distributed import _distributed_available
|
61 |
+
# from lightning.pytorch.utilities.rank_zero import WarningCache
|
62 |
+
# from lightning.pytorch.utilities.warnings import PossibleUserWarning
|
63 |
+
# from lightning.pytorch.trainer.connectors.logger_connector.result import _ResultCollection
|
64 |
+
|
65 |
+
# warning_cache = WarningCache()
|
66 |
+
#
|
67 |
+
# @staticmethod
|
68 |
+
# def _get_cache(result_metric, on_step: bool):
|
69 |
+
# cache = None
|
70 |
+
# if on_step and result_metric.meta.on_step:
|
71 |
+
# cache = result_metric._forward_cache
|
72 |
+
# elif not on_step and result_metric.meta.on_epoch:
|
73 |
+
# if result_metric._computed is None:
|
74 |
+
# should = result_metric.meta.sync.should
|
75 |
+
# if not should and _distributed_available() and result_metric.is_tensor:
|
76 |
+
# warning_cache.warn(
|
77 |
+
# f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)`"
|
78 |
+
# " when logging on epoch level in distributed setting to accumulate the metric across"
|
79 |
+
# " devices.",
|
80 |
+
# category=PossibleUserWarning,
|
81 |
+
# )
|
82 |
+
# result_metric.compute()
|
83 |
+
# result_metric.meta.sync.should = should
|
84 |
+
#
|
85 |
+
# cache = result_metric._computed
|
86 |
+
#
|
87 |
+
# if cache is not None:
|
88 |
+
# if isinstance(cache, Tensor):
|
89 |
+
# if not result_metric.meta.enable_graph:
|
90 |
+
# return cache.detach()
|
91 |
+
#
|
92 |
+
# return cache
|
93 |
+
#
|
94 |
+
#
|
95 |
+
# _ResultCollection._get_cache = _get_cache
|
96 |
+
|
97 |
+
# Switch float32 matmul precision to "high" when a CUDA device reporting compute capability >= (8, 0)
# is present; the capability query is only reached if CUDA is available.
if torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0):
    torch.set_float32_matmul_precision("high")
    log.info("Your GPU supports tensor cores, "
             "we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`")
|
deepscreen/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (3.28 kB). View file
|
|
deepscreen/__pycache__/predict.cpython-311.pyc
ADDED
Binary file (3.38 kB). View file
|
|
deepscreen/data/__init__.py
ADDED
File without changes
|
deepscreen/data/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (179 Bytes). View file
|
|
deepscreen/data/__pycache__/dti.cpython-311.pyc
ADDED
Binary file (23 kB). View file
|
|
deepscreen/data/dti.py
ADDED
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from functools import partial
|
3 |
+
from numbers import Number
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
6 |
+
|
7 |
+
from lightning import LightningDataModule
|
8 |
+
import pandas as pd
|
9 |
+
import swifter
|
10 |
+
from sklearn.preprocessing import LabelEncoder
|
11 |
+
from torch.utils.data import Dataset, DataLoader
|
12 |
+
|
13 |
+
from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
|
14 |
+
from deepscreen.utils import get_logger
|
15 |
+
|
16 |
+
log = get_logger(__name__)
|
17 |
+
|
18 |
+
# Both patterns are negated character classes: they match the characters that are NOT allowed, so any
# match found in a sequence is an invalid character.
# Characters outside the set accepted in SMILES strings (`X1`).
SMILES_PAT = r"[^A-Za-z0-9=#:+\-\[\]<>()/\\@%,.*]"
# Characters other than uppercase letters, `*` and `-`, the set accepted in FASTA sequences (`X2`).
FASTA_PAT = r"[^A-Z*\-]"
|
20 |
+
|
21 |
+
|
22 |
+
def validate_seq_str(seq, regex):
    """
    Check a sequence string for invalid characters.

    Args:
        seq: The sequence to check (e.g. a SMILES or FASTA string).
        regex: Pattern that matches every character considered invalid.

    Returns:
        None when `seq` is a non-empty string with no match for `regex`. Otherwise a short description of
        the problem: 'Empty string' for a falsy value, a 'Not a string (...)' note for other non-string
        values, or the distinct offending characters joined by ', ' in sorted order.
    """
    if not seq:
        return 'Empty string'
    if not isinstance(seq, str):
        # E.g. a NaN standing in for a missing cell: report it as an invalid record instead of letting
        # `re.findall` raise TypeError.
        return f'Not a string ({type(seq).__name__})'

    err_charset = set(re.findall(regex, seq))
    if not err_charset:
        return None
    # Sort so the message is reproducible; set iteration order for strings varies between runs.
    return ', '.join(sorted(err_charset))
|
31 |
+
|
32 |
+
|
33 |
+
# TODO: save a list of corrupted records
|
34 |
+
|
35 |
+
def rdkit_canonicalize(smiles):
    """
    Return the RDKit canonical form of a SMILES string.

    The input is parsed with `Chem.MolFromSmiles` and written back with `Chem.MolToSmiles`. If either
    step raises, a warning is logged and the input is returned unchanged.
    """
    from rdkit import Chem
    try:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    except Exception as err:
        log.warning(f'Failed to canonicalize SMILES using RDKIT due to {str(err)}. Returning original SMILES: {smiles}')
        return smiles
|
44 |
+
|
45 |
+
|
46 |
+
class DTIDataset(Dataset):
|
47 |
+
def __init__(
|
48 |
+
self,
|
49 |
+
task: Literal['regression', 'binary', 'multiclass'],
|
50 |
+
num_classes: Optional[int],
|
51 |
+
data_path: str | Path,
|
52 |
+
drug_featurizer: callable,
|
53 |
+
protein_featurizer: callable,
|
54 |
+
thresholds: Optional[Union[Number, Sequence[Number]]] = None,
|
55 |
+
discard_intermediate: Optional[bool] = False,
|
56 |
+
query: Optional[str] = 'X2'
|
57 |
+
):
|
58 |
+
df = pd.read_csv(
|
59 |
+
data_path,
|
60 |
+
engine='python',
|
61 |
+
header=0,
|
62 |
+
usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
|
63 |
+
dtype={
|
64 |
+
'X1': 'str',
|
65 |
+
'ID1': 'str',
|
66 |
+
'X2': 'str',
|
67 |
+
'ID2': 'str',
|
68 |
+
'Y': 'float32',
|
69 |
+
'U': 'str',
|
70 |
+
},
|
71 |
+
)
|
72 |
+
# Read the whole data table
|
73 |
+
|
74 |
+
# if 'ID1' in df:
|
75 |
+
# self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
|
76 |
+
# if 'ID2' in df:
|
77 |
+
# self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
|
78 |
+
# self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
|
79 |
+
# self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
|
80 |
+
|
81 |
+
# # train and eval mode data processing (fully labelled)
|
82 |
+
# if 'Y' in df.columns and df['Y'].notnull().all():
|
83 |
+
log.info(f"Processing data file: {data_path}")
|
84 |
+
|
85 |
+
# Forward-fill all non-label columns
|
86 |
+
df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
|
87 |
+
|
88 |
+
# TODO potentially allow running through the whole data validation process
|
89 |
+
# error = False
|
90 |
+
|
91 |
+
if 'Y' in df:
|
92 |
+
log.info(f"Validating labels (`Y`)...")
|
93 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
94 |
+
match task:
|
95 |
+
case 'regression':
|
96 |
+
assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
|
97 |
+
f"""`Y` must be numeric for `regression` task,
|
98 |
+
but it has {set(df['Y'].swifter.apply(type))}."""
|
99 |
+
|
100 |
+
case 'binary':
|
101 |
+
if all(df['Y'].isin([0, 1])):
|
102 |
+
assert not thresholds, \
|
103 |
+
f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
|
104 |
+
but still got `thresholds` ({thresholds}).
|
105 |
+
Double check your choices of `task` and `thresholds`, and records in the `Y` column."""
|
106 |
+
else:
|
107 |
+
assert thresholds, \
|
108 |
+
f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
|
109 |
+
but it has {pd.unique(df['Y'])}.
|
110 |
+
You may set `thresholds` to discretize continuous labels.""" # TODO print err idx instead
|
111 |
+
|
112 |
+
case 'multiclass':
|
113 |
+
assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
|
114 |
+
|
115 |
+
if all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)):
|
116 |
+
assert not thresholds, \
|
117 |
+
f"""`Y` is already non-negative integers for
|
118 |
+
`multiclass` (classification) `task`, but still got `thresholds` ({thresholds}).
|
119 |
+
Double check your choice of `task`, `thresholds` and records in the `Y` column."""
|
120 |
+
else:
|
121 |
+
assert thresholds, \
|
122 |
+
f"""`Y` must be non-negative integers for
|
123 |
+
`multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
|
124 |
+
You must set `thresholds` to discretize continuous labels.""" # TODO print err idx instead
|
125 |
+
|
126 |
+
if 'U' in df.columns:
|
127 |
+
units = df['U']
|
128 |
+
else:
|
129 |
+
units = None
|
130 |
+
log.warning("Units ('U') not in the data table. "
|
131 |
+
"Assuming all labels to be discrete or in p-scale (-log10[M]).")
|
132 |
+
|
133 |
+
# Transform labels
|
134 |
+
df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
|
135 |
+
discard_intermediate=discard_intermediate)
|
136 |
+
|
137 |
+
# Filter out rows with a NaN in Y (missing values)
|
138 |
+
df.dropna(subset=['Y'], inplace=True)
|
139 |
+
|
140 |
+
match task:
|
141 |
+
case 'regression':
|
142 |
+
df['Y'] = df['Y'].astype('float32')
|
143 |
+
assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
|
144 |
+
f"""`Y` must be numeric for `regression` task,
|
145 |
+
but after transformation it still has {set(df['Y'].swifter.apply(type))}.
|
146 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
147 |
+
# TODO print err idx instead
|
148 |
+
case 'binary':
|
149 |
+
df['Y'] = df['Y'].astype('int')
|
150 |
+
assert all(df['Y'].isin([0, 1])), \
|
151 |
+
f"""`Y` must be 0 or 1 for `task=binary`, "
|
152 |
+
but after transformation it still has {pd.unique(df['Y'])}.
|
153 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
154 |
+
# TODO print err idx instead
|
155 |
+
case 'multiclass':
|
156 |
+
df['Y'] = df['Y'].astype('int')
|
157 |
+
assert all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)), \
|
158 |
+
f"""Y must be non-negative integers for `task=multiclass`
|
159 |
+
but after transformation it still has {pd.unique(df['Y'])}.
|
160 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
161 |
+
# TODO print err idx instead
|
162 |
+
target_n_unique = df['Y'].nunique()
|
163 |
+
assert target_n_unique == num_classes, \
|
164 |
+
f"""You have set `num_classes` for `task=multiclass` to {num_classes},
|
165 |
+
but after transformation Y still has {target_n_unique} unique labels.
|
166 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
167 |
+
|
168 |
+
log.info("Validating SMILES (`X1`)...")
|
169 |
+
df['X1_ERR'] = df['X1'].swifter.progress_bar(
|
170 |
+
desc="Validating SMILES...").apply(validate_seq_str, regex=SMILES_PAT)
|
171 |
+
if not df['X1_ERR'].isna().all():
|
172 |
+
raise Exception(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
|
173 |
+
df['X1^'] = df['X1'].apply(rdkit_canonicalize) # swifter
|
174 |
+
|
175 |
+
log.info("Validating FASTA (`X2`)...")
|
176 |
+
df['X2'] = df['X2'].str.upper()
|
177 |
+
df['X2_ERR'] = df['X2'].swifter.progress_bar(
|
178 |
+
desc="Validating FASTA...").apply(validate_seq_str, regex=FASTA_PAT)
|
179 |
+
if not df['X2_ERR'].isna().all():
|
180 |
+
raise Exception(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
|
181 |
+
|
182 |
+
# FASTA/SMILES indices as query for retrieval metrics like enrichment factor and hit rate
|
183 |
+
if query:
|
184 |
+
df['ID^'] = LabelEncoder().fit_transform(df[query])
|
185 |
+
|
186 |
+
self.df = df
|
187 |
+
self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
|
188 |
+
self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
|
189 |
+
|
190 |
+
def __len__(self):
|
191 |
+
return len(self.df.index)
|
192 |
+
|
193 |
+
def __getitem__(self, i):
|
194 |
+
sample = self.df.loc[i]
|
195 |
+
return {
|
196 |
+
'N': i,
|
197 |
+
'X1': sample['X1'],
|
198 |
+
'X1^': self.drug_featurizer(sample['X1^']),
|
199 |
+
'ID1': sample.get('ID1'),
|
200 |
+
'X2': sample['X2'],
|
201 |
+
'X2^': self.protein_featurizer(sample['X2']),
|
202 |
+
'ID2': sample.get('ID2'),
|
203 |
+
'Y': sample.get('Y'),
|
204 |
+
'ID^': sample.get('ID^'),
|
205 |
+
}
|
206 |
+
|
207 |
+
|
208 |
+
class DTIDataModule(LightningDataModule):
    """
    DTI DataModule

    A DataModule implements 6 key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
            self,
            task: Literal['regression', 'binary', 'multiclass'],
            num_classes: Optional[int],
            batch_size: int,
            # train: bool,
            drug_featurizer: callable,
            protein_featurizer: callable,
            collator: callable = collate_fn,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
            split: Optional[callable] = None,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            discard_intermediate: Optional[bool] = False,
            num_workers: int = 0,
            pin_memory: bool = False,
    ):
        super().__init__()

        self.train_data: Optional[Dataset] = None
        self.val_data: Optional[Dataset] = None
        self.test_data: Optional[Dataset] = None
        self.predict_data: Optional[Dataset] = None
        self.split = split
        self.collator = collator
        # Dataset factory with everything bound except `data_path`.
        self.dataset = partial(
            DTIDataset,
            task=task,
            num_classes=num_classes,
            drug_featurizer=drug_featurizer,
            protein_featurizer=protein_featurizer,
            thresholds=thresholds,
            discard_intermediate=discard_intermediate
        )

        # this line allows to access init params with 'self.hparams' ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)  # ignore=['split']

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.train_data`, `self.val_data`, `self.test_data`, `self.predict_data`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.

        Two configurations are accepted:
          * `train_val_test_split` with three entries, either numbers (together with `data_file` and
            `split`, which splits one file) or file paths (without `data_file` and `split`);
          * only `data_file`, which is loaded as both the test and the predict dataset.

        `stage` and `encoding` are not used by this method.

        Raises:
            ValueError: If the combination of `data_file`, `split` and `train_val_test_split` is not
                one of the accepted configurations.
        """
        # load and split datasets only if not loaded in initialization
        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
            if self.hparams.train_val_test_split:
                if len(self.hparams.train_val_test_split) != 3:
                    raise ValueError('Length of `train_val_test_split` must be 3. '
                                     'Set the second element to None for training without validation. '
                                     'Set the third element to None for training without testing.')

                # Temporarily hold the raw split entries; non-None ones are replaced by datasets below.
                self.train_data = self.hparams.train_val_test_split[0]
                self.val_data = self.hparams.train_val_test_split[1]
                self.test_data = self.hparams.train_val_test_split[2]

                if all([self.hparams.data_file, self.split]):
                    if all(isinstance(split, Number) or split is None
                           for split in self.hparams.train_val_test_split):
                        split_data = self.split(
                            dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
                            lengths=[split for split in self.hparams.train_val_test_split if split is not None]
                        )
                        # Hand out the resulting subsets in order to the requested (non-None) slots.
                        for dataset in ['train_data', 'val_data', 'test_data']:
                            if getattr(self, dataset) is not None:
                                setattr(self, dataset, split_data.pop(0))

                    else:
                        raise ValueError('`train_val_test_split` must be a sequence of numbers or None '
                                         '(float for percentages and int for sample numbers) '
                                         'if both `data_file` and `split` have been specified.')

                elif (all(isinstance(split, str) or split is None
                          for split in self.hparams.train_val_test_split)
                      and not any([self.hparams.data_file, self.split])):
                    for dataset in ['train_data', 'val_data', 'test_data']:
                        if getattr(self, dataset) is not None:
                            data_path = Path(getattr(self, dataset))
                            if not data_path.is_absolute():
                                data_path = Path(self.hparams.data_dir, data_path)
                            setattr(self, dataset, self.dataset(data_path=data_path))

                else:
                    raise ValueError('For training, you must specify either all of `data_file`, `split`, '
                                     'and `train_val_test_split` as a sequence of numbers or '
                                     'solely `train_val_test_split` as a sequence of data file paths.')

            elif self.hparams.data_file and not any([self.split, self.hparams.train_val_test_split]):
                data_path = Path(self.hparams.data_file)
                if not data_path.is_absolute():
                    data_path = Path(self.hparams.data_dir, data_path)
                self.test_data = self.predict_data = self.dataset(data_path=data_path)

            else:
                raise ValueError("For training, you must specify `train_val_test_split`. "
                                 "For testing/predicting, you must specify only `data_file` without "
                                 "`train_val_test_split` or `split`.")

    def _make_dataloader(self, dataset, *, shuffle: bool, drop_last: bool):
        """Build a DataLoader over `dataset` using a `SafeBatchSampler` and the loader hyperparameters."""
        return DataLoader(
            dataset=dataset,
            batch_sampler=SafeBatchSampler(
                data_source=dataset,
                batch_size=self.hparams.batch_size,
                drop_last=drop_last,
                shuffle=shuffle,
            ),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=self.hparams.num_workers > 0,
        )

    def train_dataloader(self):
        """Return the training DataLoader (shuffled, last incomplete batch dropped)."""
        # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
        # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
        return self._make_dataloader(self.train_data, shuffle=True, drop_last=True)

    def val_dataloader(self):
        """Return the validation DataLoader (in order, all batches kept)."""
        return self._make_dataloader(self.val_data, shuffle=False, drop_last=False)

    def test_dataloader(self):
        """Return the test DataLoader (in order, all batches kept)."""
        return self._make_dataloader(self.test_data, shuffle=False, drop_last=False)

    def predict_dataloader(self):
        """Return the prediction DataLoader (in order, all batches kept)."""
        return self._make_dataloader(self.predict_data, shuffle=False, drop_last=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/dti.py.bak
ADDED
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
+
from numbers import Number
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
5 |
+
|
6 |
+
from lightning import LightningDataModule
|
7 |
+
import pandas as pd
|
8 |
+
from sklearn.preprocessing import LabelEncoder
|
9 |
+
from torch.utils.data import Dataset, DataLoader
|
10 |
+
|
11 |
+
from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
|
12 |
+
from deepscreen.utils import get_logger
|
13 |
+
|
14 |
+
log = get_logger(__name__)
|
15 |
+
|
16 |
+
|
17 |
+
# TODO: save a list of corrupted records
|
18 |
+
|
19 |
+
|
20 |
+
class DTIDataset(Dataset):
|
21 |
+
def __init__(
|
22 |
+
self,
|
23 |
+
task: Literal['regression', 'binary', 'multiclass'],
|
24 |
+
n_class: Optional[int],
|
25 |
+
data_path: str | Path,
|
26 |
+
drug_featurizer: callable,
|
27 |
+
protein_featurizer: callable,
|
28 |
+
thresholds: Optional[Union[Number, Sequence[Number]]] = None,
|
29 |
+
discard_intermediate: Optional[bool] = False,
|
30 |
+
):
|
31 |
+
df = pd.read_csv(
|
32 |
+
data_path,
|
33 |
+
engine='python',
|
34 |
+
header=0,
|
35 |
+
usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
|
36 |
+
dtype={
|
37 |
+
'X1': 'str',
|
38 |
+
'ID1': 'str',
|
39 |
+
'X2': 'str',
|
40 |
+
'ID2': 'str',
|
41 |
+
'Y': 'float32',
|
42 |
+
'U': 'str',
|
43 |
+
},
|
44 |
+
)
|
45 |
+
# Read the whole data table
|
46 |
+
|
47 |
+
# if 'ID1' in df:
|
48 |
+
# self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
|
49 |
+
# if 'ID2' in df:
|
50 |
+
# self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
|
51 |
+
# self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
|
52 |
+
# self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
|
53 |
+
|
54 |
+
# # train and eval mode data processing (fully labelled)
|
55 |
+
# if 'Y' in df.columns and df['Y'].notnull().all():
|
56 |
+
log.info(f"Processing data file: {data_path}")
|
57 |
+
|
58 |
+
# Forward-fill all non-label columns
|
59 |
+
df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
|
60 |
+
|
61 |
+
if 'Y' in df:
|
62 |
+
log.info(f"Performing pre-transformation target validation.")
|
63 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
64 |
+
match task:
|
65 |
+
case 'regression':
|
66 |
+
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
67 |
+
f"""`Y` must be numeric for `regression` task,
|
68 |
+
but it has {set(df['Y'].apply(type))}."""
|
69 |
+
|
70 |
+
case 'binary':
|
71 |
+
if all(df['Y'].isin([0, 1])):
|
72 |
+
assert not thresholds, \
|
73 |
+
f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
|
74 |
+
but still got `thresholds` {thresholds}.
|
75 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` column."""
|
76 |
+
else:
|
77 |
+
assert thresholds, \
|
78 |
+
f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
|
79 |
+
but it has {pd.unique(df['Y'])}.
|
80 |
+
You must set `thresholds` to discretize continuous labels."""
|
81 |
+
|
82 |
+
case 'multiclass':
|
83 |
+
assert n_class >= 3, f'`n_class` for `multiclass` (classification) `task` must be at least 3.'
|
84 |
+
|
85 |
+
if all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)):
|
86 |
+
assert not thresholds, \
|
87 |
+
f"""`Y` is already non-negative integers for
|
88 |
+
`multiclass` (classification) `task`, but still got `thresholds` {thresholds}.
|
89 |
+
Double check your choice of `task`, `thresholds` and records in the `Y` column."""
|
90 |
+
else:
|
91 |
+
assert thresholds, \
|
92 |
+
f"""`Y` must be non-negative integers for
|
93 |
+
`multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
|
94 |
+
You must set `thresholds` to discretize continuous labels."""
|
95 |
+
|
96 |
+
if 'U' in df.columns:
|
97 |
+
units = df['U']
|
98 |
+
else:
|
99 |
+
units = None
|
100 |
+
log.warning("Units ('U') not in the data table. "
|
101 |
+
"Assuming all labels to be discrete or in p-scale (-log10[M]).")
|
102 |
+
|
103 |
+
# Transform labels
|
104 |
+
df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
|
105 |
+
discard_intermediate=discard_intermediate)
|
106 |
+
|
107 |
+
# Filter out rows with a NaN in Y (missing values)
|
108 |
+
df.dropna(subset=['Y'], inplace=True)
|
109 |
+
|
110 |
+
log.info(f"Performing post-transformation target validation.")
|
111 |
+
match task:
|
112 |
+
case 'regression':
|
113 |
+
df['Y'] = df['Y'].astype('float32')
|
114 |
+
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
115 |
+
f"""`Y` must be numeric for `regression` task,
|
116 |
+
but after transformation it still has {set(df['Y'].apply(type))}.
|
117 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
118 |
+
|
119 |
+
case 'binary':
|
120 |
+
df['Y'] = df['Y'].astype('int')
|
121 |
+
assert all(df['Y'].isin([0, 1])), \
|
122 |
+
f"""`Y` must be 0 or 1 for `binary` (classification) `task`, "
|
123 |
+
but after transformation it still has {pd.unique(df['Y'])}.
|
124 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
125 |
+
|
126 |
+
case 'multiclass':
|
127 |
+
df['Y'] = df['Y'].astype('int')
|
128 |
+
assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
|
129 |
+
f"""Y must be non-negative integers for task `multiclass` (classification)
|
130 |
+
but after transformation it still has {pd.unique(df['Y'])}.
|
131 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
132 |
+
|
133 |
+
target_n_unique = df['Y'].nunique()
|
134 |
+
assert target_n_unique == n_class, \
|
135 |
+
f"""You have set `n_class` for `multiclass` (classification) `task` to {n_class},
|
136 |
+
but after transformation Y still has {target_n_unique} unique labels.
|
137 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
138 |
+
|
139 |
+
# Indexed protein/FASTA for retrieval metrics
|
140 |
+
df['IDX'] = LabelEncoder().fit_transform(df['X2'])
|
141 |
+
|
142 |
+
self.df = df
|
143 |
+
self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
|
144 |
+
self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
|
145 |
+
|
146 |
+
def __len__(self):
|
147 |
+
return len(self.df.index)
|
148 |
+
|
149 |
+
def __getitem__(self, i):
|
150 |
+
sample = self.df.loc[i]
|
151 |
+
return {
|
152 |
+
'N': i,
|
153 |
+
'X1': self.drug_featurizer(sample['X1']),
|
154 |
+
'ID1': sample.get('ID1', sample['X1']),
|
155 |
+
'X2': self.protein_featurizer(sample['X2']),
|
156 |
+
'ID2': sample.get('ID2', sample['X2']),
|
157 |
+
'Y': sample.get('Y'),
|
158 |
+
'IDX': sample['IDX'],
|
159 |
+
}
|
160 |
+
|
161 |
+
|
162 |
+
class DTIDataModule(LightningDataModule):
    """
    DTI DataModule

    A DataModule implements these key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html

    The constructor accepts exactly one of these argument combinations:

    * `data_file` + `split` + numeric `train_val_test_split`: the file is loaded and split
      in `setup()` by calling `split(dataset=..., lengths=...)`.
    * `train_val_test_split` as 2 or 3 file paths (no `data_file`, no `split`): the train,
      validation and (optionally) test files are loaded in the constructor.
    * `data_file` only: the file is loaded in the constructor and used for both testing and
      predicting.

    Relative paths are resolved against `data_dir`. Any other combination raises `ValueError`.
    """

    def __init__(
            self,
            task: Literal['regression', 'binary', 'multiclass'],
            n_class: Optional[int],
            batch_size: int,
            # train: bool,
            drug_featurizer: callable,
            protein_featurizer: callable,
            collator: callable = collate_fn,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
            split: Optional[callable] = None,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            discard_intermediate: Optional[bool] = False,
            num_workers: int = 0,
            pin_memory: bool = False,
    ):
        super().__init__()

        self.train_data: Optional[Dataset] = None
        self.val_data: Optional[Dataset] = None
        self.test_data: Optional[Dataset] = None
        self.predict_data: Optional[Dataset] = None
        self.split = split
        self.collator = collator
        # Dataset factory with everything bound except the data file path.
        self.dataset = partial(
            DTIDataset,
            task=task,
            n_class=n_class,
            drug_featurizer=drug_featurizer,
            protein_featurizer=protein_featurizer,
            thresholds=thresholds,
            discard_intermediate=discard_intermediate
        )

        if train_val_test_split:
            # TODO test behavior for trainer.test and predict when this is passed
            if len(train_val_test_split) not in [2, 3]:
                raise ValueError('Length of `train_val_test_split` must be 2 (for training without testing) or 3.')
            if all([data_file, split]):
                # Numeric lengths: the actual splitting is deferred to `setup()`.
                if not all(isinstance(entry, Number) for entry in train_val_test_split):
                    raise ValueError('`train_val_test_split` must be a sequence numbers '
                                     '(float for percentages and int for sample numbers) '
                                     'if both `data_file` and `split` have been specified.')
            elif all(isinstance(entry, str) for entry in train_val_test_split) and not any([data_file, split]):
                # NOTE: the loop variable must not be called `split`. Rebinding the `split`
                # argument here would make `save_hyperparameters()` below (which reads the
                # current values of the constructor's locals) record a file path for it.
                split_paths = []
                for split_file in train_val_test_split:
                    split_file = Path(split_file)
                    if not split_file.is_absolute():
                        split_file = Path(data_dir, split_file)
                    split_paths.append(split_file)

                self.train_data = self.dataset(data_path=split_paths[0])
                self.val_data = self.dataset(data_path=split_paths[1])
                if len(train_val_test_split) == 3:
                    self.test_data = self.dataset(data_path=split_paths[2])
            else:
                raise ValueError('For training, you must specify either `data_file`, `split`, '
                                 'and `train_val_test_split` as a sequence of numbers or '
                                 'solely `train_val_test_split` as a sequence of data file paths.')

        elif data_file and not any([split, train_val_test_split]):
            data_file = Path(data_file)
            if not data_file.is_absolute():
                data_file = Path(data_dir, data_file)
            self.test_data = self.predict_data = self.dataset(data_path=data_file)
        else:
            raise ValueError("For training, you must specify `train_val_test_split`. "
                             "For testing/predicting, you must specify only `data_file` without "
                             "`train_val_test_split` or `split`.")

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)  # ignore=['split']

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.train_data`, `self.val_data`, `self.test_data`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.

        `stage` and `encoding` are accepted but not used by this implementation.
        """
        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
        # load and split datasets only if not loaded in initialization
        loaded = (self.train_data, self.val_data, self.test_data, self.predict_data)
        # Compare against None instead of relying on truthiness: a dataset that defines
        # `__len__` is falsy when empty and would otherwise be treated as "not loaded".
        if all(data is None for data in loaded):
            # The constructor accepts 2 lengths (training without testing) as well as 3,
            # so the test part is optional here.
            self.train_data, self.val_data, *rest = self.split(
                dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
                lengths=self.hparams.train_val_test_split
            )
            self.test_data = rest[0] if rest else None

    def _make_dataloader(self, dataset, *, train: bool = False):
        """Build a `DataLoader` over `dataset`; shuffle and drop the last batch only when `train`."""
        return DataLoader(
            dataset=dataset,
            batch_sampler=SafeBatchSampler(
                data_source=dataset,
                batch_size=self.hparams.batch_size,
                # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
                # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
                drop_last=train,
                shuffle=train,
            ),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=self.hparams.num_workers > 0,
        )

    def train_dataloader(self):
        """Return the shuffled training dataloader (last incomplete batch dropped)."""
        return self._make_dataloader(self.train_data, train=True)

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling, all batches kept)."""
        return self._make_dataloader(self.val_data)

    def test_dataloader(self):
        """Return the test dataloader (no shuffling, all batches kept)."""
        return self._make_dataloader(self.test_data)

    def predict_dataloader(self):
        """Return the prediction dataloader (no shuffling, all batches kept)."""
        return self._make_dataloader(self.predict_data)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/dti_datamodule.py
ADDED
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from itertools import product
|
2 |
+
from collections import namedtuple
|
3 |
+
from numbers import Number
|
4 |
+
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
5 |
+
|
6 |
+
# import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
from lightning import LightningDataModule
|
9 |
+
from torch.utils.data import Dataset, DataLoader, random_split
|
10 |
+
|
11 |
+
from deepscreen.data.utils.label import label_transform
|
12 |
+
from deepscreen.data.utils.collator import collate_fn
|
13 |
+
from deepscreen.data.utils.sampler import SafeBatchSampler
|
14 |
+
|
15 |
+
|
16 |
+
class DTIDataset(Dataset):
|
17 |
+
def __init__(
|
18 |
+
self,
|
19 |
+
task: Literal['regression', 'binary', 'multiclass'],
|
20 |
+
n_classes: Optional[int],
|
21 |
+
data_dir: str,
|
22 |
+
dataset_name: str,
|
23 |
+
drug_featurizer: callable,
|
24 |
+
protein_featurizer: callable,
|
25 |
+
thresholds: Optional[Union[Number, Sequence[Number]]] = None,
|
26 |
+
discard_intermediate: Optional[bool] = False,
|
27 |
+
):
|
28 |
+
df = pd.read_csv(
|
29 |
+
f'{data_dir}{dataset_name}.csv',
|
30 |
+
header=0, sep=',',
|
31 |
+
usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
|
32 |
+
dtype={'X1': 'str', 'ID1': 'str',
|
33 |
+
'X2': 'str', 'ID2': 'str',
|
34 |
+
'Y': 'float32', 'U': 'str'}
|
35 |
+
)
|
36 |
+
# if 'ID1' in df:
|
37 |
+
# self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
|
38 |
+
# if 'ID2' in df:
|
39 |
+
# self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
|
40 |
+
# self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
|
41 |
+
# self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
|
42 |
+
|
43 |
+
# # train and eval mode data processing (fully labelled)
|
44 |
+
# if 'Y' in df.columns and df['Y'].notnull().all():
|
45 |
+
|
46 |
+
# Forward-fill all non-label columns
|
47 |
+
df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
|
48 |
+
|
49 |
+
if 'Y' in df:
|
50 |
+
# Transform labels
|
51 |
+
df['Y'] = df['Y'].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
|
52 |
+
discard_intermediate=discard_intermediate).astype('float32')
|
53 |
+
|
54 |
+
# Filter out rows with a NaN in Y (missing values)
|
55 |
+
df.dropna(subset=['Y'], inplace=True)
|
56 |
+
|
57 |
+
# Validate target labels for training/testing
|
58 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
59 |
+
match task:
|
60 |
+
case 'regression':
|
61 |
+
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
62 |
+
f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
|
63 |
+
case 'binary':
|
64 |
+
assert all(df['Y'].isin([0, 1])), \
|
65 |
+
f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
|
66 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
67 |
+
case 'multiclass':
|
68 |
+
assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
|
69 |
+
assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
|
70 |
+
f"Y for task `multiclass` (classification) must be non-negative integers, " \
|
71 |
+
f"but Y got {pd.unique(df['Y'])}." \
|
72 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
73 |
+
target_n_unique = df['Y'].nunique()
|
74 |
+
assert target_n_unique == n_classes, \
|
75 |
+
f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
|
76 |
+
f"but Y has {target_n_unique} unique labels."
|
77 |
+
|
78 |
+
# # Predict mode data processing
|
79 |
+
# else:
|
80 |
+
# df = pd.DataFrame(product(df['X1'].dropna(), df['X2'].dropna()), columns=['X1', 'X2'])
|
81 |
+
# if hasattr(self, "x1_to_id1"):
|
82 |
+
# df['ID1'] = df['X1'].map(self.x1_to_id1)
|
83 |
+
# if hasattr(self, "x1_to_id2"):
|
84 |
+
# df['ID2'] = df['X2'].map(self.x2_to_id2)
|
85 |
+
|
86 |
+
# self.smiles = df['X1']
|
87 |
+
# self.fasta = df['X2']
|
88 |
+
# self.smiles_ids = df.get('ID1', df['X1'])
|
89 |
+
# self.fasta_ids = df.get('ID2', df['X2'])
|
90 |
+
# self.labels = df.get('Y', None)
|
91 |
+
|
92 |
+
self.df = df
|
93 |
+
self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
|
94 |
+
self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
|
95 |
+
self.n_classes = df['Y'].nunique()
|
96 |
+
# self.train = train
|
97 |
+
|
98 |
+
self.Data = namedtuple('Data', ['FT1', 'ID1', 'FT2', 'ID2', 'Y'])
|
99 |
+
|
100 |
+
def __len__(self):
|
101 |
+
return len(self.df.index)
|
102 |
+
|
103 |
+
def __getitem__(self, idx):
|
104 |
+
sample = self.df.loc[idx]
|
105 |
+
return self.Data(
|
106 |
+
FT1=self.drug_featurizer(sample['X1']),
|
107 |
+
ID1=sample.get('ID1', sample['X1']),
|
108 |
+
FT2=self.protein_featurizer(sample['X2']),
|
109 |
+
ID2=sample.get('ID2', sample['X2']),
|
110 |
+
Y=sample.get('Y')
|
111 |
+
)
|
112 |
+
# {
|
113 |
+
# 'FT1': self.drug_featurizer(sample['X1']),
|
114 |
+
# 'ID1': sample.get('ID1', sample['X1']),
|
115 |
+
# 'FT2': self.protein_featurizer(sample['X2']),
|
116 |
+
# 'ID2': sample.get('ID2', sample['X2']),
|
117 |
+
# 'Y': sample.get('Y')
|
118 |
+
# }
|
119 |
+
# if self.train:
|
120 |
+
# sample = self.drug_featurizer(self.smiles[idx]), self.protein_featurizer(self.fasta[idx]), self.labels[idx]
|
121 |
+
# sample = {
|
122 |
+
# 'FT1': self.drug_featurizer(self.smiles[idx]),
|
123 |
+
# 'FT2': self.protein_featurizer(self.fasta[idx]),
|
124 |
+
# 'ID2': self.smiles_ids[idx],
|
125 |
+
# }
|
126 |
+
# else:
|
127 |
+
# # sample = self.drug_featurizer(self.smiles[idx]), self.protein_featurizer(self.fasta[idx])
|
128 |
+
# sample = {
|
129 |
+
# 'FT1': self.drug_featurizer(self.smiles[idx]),
|
130 |
+
# 'FT2': self.protein_featurizer(self.fasta[idx]),
|
131 |
+
# }
|
132 |
+
#
|
133 |
+
# if all([True if n is not None else False for n in sample.values()]):
|
134 |
+
# return sample # | {
|
135 |
+
# # 'ID1': self.smiles_ids[idx],
|
136 |
+
# # 'X1': self.drug_featurizer(self.smiles[idx]),
|
137 |
+
# # 'ID2': self.fasta_ids[idx],
|
138 |
+
# # 'X2': self.protein_featurizer(self.fasta[idx]),
|
139 |
+
# # }
|
140 |
+
# else:
|
141 |
+
# return self.__getitem__(np.random.randint(0, self.size))
|
142 |
+
|
143 |
+
|
144 |
+
class DTIdatamodule(LightningDataModule):
    """
    DTI DataModule

    A DataModule implements these key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html

    All constructor arguments are stored in `self.hparams`. The dataset itself is only loaded
    in `setup()`: with `train=True` it is split by calling `split(dataset=..., lengths=...)`,
    otherwise the whole dataset is used for both testing and predicting.
    """

    def __init__(
            self,
            task: Literal['regression', 'binary', 'multiclass'],
            n_classes: Optional[int],
            train: bool,
            drug_featurizer: callable,
            protein_featurizer: callable,
            batch_size: int,
            train_val_test_split: Optional[Sequence[Number]],
            num_workers: int = 0,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            pin_memory: bool = False,
            data_dir: str = "data/",
            dataset_name: Optional[str] = None,
            split: Optional[callable] = random_split,
    ):
        super().__init__()

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

        # data processing
        self.data_split = split

        self.data_train: Optional[Dataset] = None
        self.data_val: Optional[Dataset] = None
        self.data_test: Optional[Dataset] = None
        self.data_predict: Optional[Dataset] = None

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.

        `stage` and `encoding` are accepted but not used by this implementation.
        """
        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
        # load and split datasets only if not loaded in initialization
        loaded = (self.data_train, self.data_val, self.data_test, self.data_predict)
        # Compare against None instead of relying on truthiness: a dataset that defines
        # `__len__` is falsy when empty and would otherwise be loaded (and split) again.
        if all(data is None for data in loaded):
            dataset = DTIDataset(
                task=self.hparams.task,
                n_classes=self.hparams.n_classes,
                data_dir=self.hparams.data_dir,
                drug_featurizer=self.hparams.drug_featurizer,
                protein_featurizer=self.hparams.protein_featurizer,
                dataset_name=self.hparams.dataset_name,
                thresholds=self.hparams.thresholds,
            )

            if self.hparams.train:
                self.data_train, self.data_val, self.data_test = self.data_split(
                    dataset=dataset,
                    lengths=self.hparams.train_val_test_split
                )
            else:
                self.data_test = self.data_predict = dataset

    def _build_loader(self, dataset, *, train: bool = False):
        """Build a `DataLoader` over `dataset`; shuffle and drop the last batch only when `train`."""
        return DataLoader(
            dataset=dataset,
            batch_sampler=SafeBatchSampler(
                data_source=dataset,
                batch_size=self.hparams.batch_size,
                drop_last=train,
                shuffle=train,
            ),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            persistent_workers=self.hparams.num_workers > 0,
        )

    def train_dataloader(self):
        """Return the shuffled training dataloader (last incomplete batch dropped)."""
        return self._build_loader(self.data_train, train=True)

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling, all batches kept)."""
        return self._build_loader(self.data_val)

    def test_dataloader(self):
        """Return the test dataloader (no shuffling, all batches kept)."""
        return self._build_loader(self.data_test)

    def predict_dataloader(self):
        """Return the prediction dataloader (no shuffling, all batches kept)."""
        return self._build_loader(self.data_predict)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/entity_datamodule.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from numbers import Number
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Any, Dict, Optional, Sequence, Type
|
4 |
+
|
5 |
+
from lightning import LightningDataModule
|
6 |
+
from sklearn.base import TransformerMixin
|
7 |
+
from torch.utils.data import Dataset, DataLoader
|
8 |
+
|
9 |
+
from deepscreen.data.utils import collate_fn, SafeBatchSampler
|
10 |
+
from deepscreen.data.utils.dataset import BaseEntityDataset
|
11 |
+
|
12 |
+
|
13 |
+
class EntityDataModule(LightningDataModule):
|
14 |
+
"""
|
15 |
+
def prepare_data(self):
|
16 |
+
# things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
|
17 |
+
# download data, pre-process, split, save to disk, etc.
|
18 |
+
def setup(self, stage):
|
19 |
+
# things to do on every process in DDP
|
20 |
+
# load data, set variables, etc.
|
21 |
+
def train_dataloader(self):
|
22 |
+
# return train dataloader
|
23 |
+
def val_dataloader(self):
|
24 |
+
# return validation dataloader
|
25 |
+
def test_dataloader(self):
|
26 |
+
# return test dataloader
|
27 |
+
def teardown(self):
|
28 |
+
# called on every process in DDP
|
29 |
+
# clean up after fit or test
|
30 |
+
"""
|
31 |
+
def __init__(
|
32 |
+
self,
|
33 |
+
dataset: type[BaseEntityDataset],
|
34 |
+
transformer: type[TransformerMixin],
|
35 |
+
train: bool,
|
36 |
+
batch_size: int,
|
37 |
+
data_dir: str = "data/",
|
38 |
+
data_file: Optional[str] = None,
|
39 |
+
train_val_test_split: Optional[Sequence[Number], Sequence[str]] = None,
|
40 |
+
split: Optional[callable] = None,
|
41 |
+
num_workers: int = 0,
|
42 |
+
pin_memory: bool = False,
|
43 |
+
):
|
44 |
+
super().__init__()
|
45 |
+
|
46 |
+
# data processing
|
47 |
+
self.split = split
|
48 |
+
|
49 |
+
if train:
|
50 |
+
if all([data_file, split]):
|
51 |
+
if all(isinstance(split, Number) for split in train_val_test_split):
|
52 |
+
pass
|
53 |
+
else:
|
54 |
+
raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
|
55 |
+
'(float for percentages and int for sample numbers) if '
|
56 |
+
'`data_file` and `split` have been specified.')
|
57 |
+
elif all(isinstance(split, str) for split in train_val_test_split) and not any([data_file, split]):
|
58 |
+
self.train_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[0]))
|
59 |
+
self.val_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[1]))
|
60 |
+
self.test_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[2]))
|
61 |
+
else:
|
62 |
+
raise ValueError('For training (train=True), you must specify either '
|
63 |
+
'`dataset_name` and `split` with `train_val_test_split` of 3 numbers or '
|
64 |
+
'solely `train_val_test_split` of 3 data file names.')
|
65 |
+
else:
|
66 |
+
if data_file and not any([split, train_val_test_split]):
|
67 |
+
self.test_data = self.predict_data = dataset(dataset_path=str(Path(data_dir) / data_file))
|
68 |
+
else:
|
69 |
+
raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
|
70 |
+
"`train_val_test_split` or `split`")
|
71 |
+
|
72 |
+
# this line allows to access init params with 'self.hparams' attribute
|
73 |
+
# also ensures init params will be stored in ckpt
|
74 |
+
self.save_hyperparameters(logger=False)
|
75 |
+
def prepare_data(self):
    """
    Hook for one-off data preparation such as downloading.

    Nothing is done here at present. State must not be assigned in this
    hook (e.g., `self.x = x`).
    """
    return None
|
80 |
+
|
81 |
+
def setup(self, stage: Optional[str] = None, encoding: str = None):
|
82 |
+
"""
|
83 |
+
Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
|
84 |
+
This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
|
85 |
+
careful not to execute data splitting twice.
|
86 |
+
"""
|
87 |
+
# TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
|
88 |
+
# TODO: find a way to apply transformer.fit_transform only to train and transformer.transform only to val, test
|
89 |
+
# load and split datasets only if not loaded in initialization
|
90 |
+
if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
|
91 |
+
self.train_data, self.val_data, self.test_data = self.split(
|
92 |
+
dataset=self.hparams.dataset(data_dir=self.hparams.data_dir,
|
93 |
+
dataset_name=self.hparams.train_dataset_name),
|
94 |
+
lengths=self.hparams.train_val_test_split
|
95 |
+
)
|
96 |
+
|
97 |
+
def train_dataloader(self):
    """Return the DataLoader over `self.train_data`, batched in shuffled order."""
    hparams = self.hparams
    # batch_size and shuffle go to the batch sampler: DataLoader does not accept them
    # together with `batch_sampler`.
    # NOTE(review): SafeBatchSampler presumably skips samples containing None — confirm.
    sampler = SafeBatchSampler(
        data_source=self.train_data,
        batch_size=hparams.batch_size,
        shuffle=True,
    )
    return DataLoader(
        dataset=self.train_data,
        batch_sampler=sampler,
        num_workers=hparams.num_workers,
        pin_memory=hparams.pin_memory,
        collate_fn=collate_fn,
        # workers are kept alive between epochs only when there are any
        persistent_workers=hparams.num_workers > 0,
    )
|
111 |
+
|
112 |
+
def val_dataloader(self):
    """Return the DataLoader over `self.val_data`, batched without shuffling."""
    hparams = self.hparams
    # batch_size and shuffle go to the batch sampler: DataLoader does not accept them
    # together with `batch_sampler`.
    sampler = SafeBatchSampler(
        data_source=self.val_data,
        batch_size=hparams.batch_size,
        shuffle=False,
    )
    return DataLoader(
        dataset=self.val_data,
        batch_sampler=sampler,
        num_workers=hparams.num_workers,
        pin_memory=hparams.pin_memory,
        collate_fn=collate_fn,
        # workers are kept alive between epochs only when there are any
        persistent_workers=hparams.num_workers > 0,
    )
|
126 |
+
|
127 |
+
def test_dataloader(self):
    """Return the DataLoader over `self.test_data`, batched without shuffling."""
    hparams = self.hparams
    # batch_size and shuffle go to the batch sampler: DataLoader does not accept them
    # together with `batch_sampler`.
    sampler = SafeBatchSampler(
        data_source=self.test_data,
        batch_size=hparams.batch_size,
        shuffle=False,
    )
    return DataLoader(
        dataset=self.test_data,
        batch_sampler=sampler,
        num_workers=hparams.num_workers,
        pin_memory=hparams.pin_memory,
        collate_fn=collate_fn,
        # workers are kept alive between epochs only when there are any
        persistent_workers=hparams.num_workers > 0,
    )
|
141 |
+
|
142 |
+
def predict_dataloader(self):
    """Return the DataLoader over `self.predict_data`, batched without shuffling."""
    hparams = self.hparams
    # batch_size and shuffle go to the batch sampler: DataLoader does not accept them
    # together with `batch_sampler`.
    sampler = SafeBatchSampler(
        data_source=self.predict_data,
        batch_size=hparams.batch_size,
        shuffle=False,
    )
    return DataLoader(
        dataset=self.predict_data,
        batch_sampler=sampler,
        num_workers=hparams.num_workers,
        pin_memory=hparams.pin_memory,
        collate_fn=collate_fn,
        # workers are kept alive between epochs only when there are any
        persistent_workers=hparams.num_workers > 0,
    )
|
156 |
+
|
157 |
+
def teardown(self, stage: Optional[str] = None):
|
158 |
+
"""Clean up after fit or test."""
|
159 |
+
pass
|
160 |
+
|
161 |
+
def state_dict(self):
    """Return extra state to store in a checkpoint; nothing extra is saved, so it is empty."""
    extra_state = dict()
    return extra_state
|
164 |
+
|
165 |
+
def load_state_dict(self, state_dict: Dict[str, Any]):
    """Hook run when a checkpoint is loaded; `state_dict` is ignored."""
    return None
|
deepscreen/data/featurizers/__init__.py
ADDED
File without changes
|
deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (191 Bytes). View file
|
|
deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc
ADDED
Binary file (5.6 kB). View file
|
|
deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc
ADDED
Binary file (14.9 kB). View file
|
|
deepscreen/data/featurizers/categorical.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
# Vocabularies of KNOWN characters in SMILES and FASTA sequences.
# Tuples (not sets) are used so the character order, and hence every index, stays fixed.
SMILES_VOCAB = tuple(
    '#%)(+-.103254'
    '7698=ACBEDGFI'
    'HKMLONPSRUTWV'
    'Y[Z]_acbedgfi'
    'hmlonsruty'
)
FASTA_VOCAB = tuple(
    'ACBEDGFIHKMLO'
    'NQPSRUTWVYXZ'
)

# Check uniqueness, then build the character -> index dicts.
# Known characters are numbered from 1; index 0 is reserved for '?' (unknown characters).
assert len(SMILES_VOCAB) == len(set(SMILES_VOCAB)), 'SMILES_CHARSET has duplicate characters.'
SMILES_CHARSET_IDX = {symbol: position for position, symbol in enumerate(SMILES_VOCAB, start=1)} | {'?': 0}

assert len(FASTA_VOCAB) == len(set(FASTA_VOCAB)), 'FASTA_CHARSET has duplicate characters.'
FASTA_CHARSET_IDX = {symbol: position for position, symbol in enumerate(FASTA_VOCAB, start=1)} | {'?': 0}
|
19 |
+
|
20 |
+
|
21 |
+
def sequence_to_onehot(sequence: str, charset, max_sequence_length: int):
    """
    One-hot encode `sequence` over `charset`.

    Characters of `charset` are numbered from 1 in order; index 0 stands for '?' and for any
    character missing from `charset`. The sequence is truncated to `max_sequence_length`;
    positions beyond its end stay all-zero.

    Returns:
        Integer array of shape (number of indices, max_sequence_length).
    """
    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
    lookup = {symbol: position for position, symbol in enumerate(charset, start=1)} | {'?': 0}

    codes = np.fromiter(
        (lookup.get(symbol, 0) for symbol in sequence[:max_sequence_length]),
        dtype=int,
    )
    encoded = np.zeros((max_sequence_length, len(lookup)), dtype=int)
    # one (position, code) cell is switched on per encoded character
    encoded[np.arange(codes.size), codes] = 1

    return encoded.transpose()
|
30 |
+
|
31 |
+
|
32 |
+
def sequence_to_label(sequence: str, charset, max_sequence_length: int):
    """
    Encode `sequence` as integer labels over `charset`.

    Characters of `charset` are numbered from 1 in order; 0 stands for '?' and for any
    character missing from `charset`. The sequence is truncated to `max_sequence_length`
    and the remaining positions are left at 0.

    Returns:
        Integer array of length `max_sequence_length`.
    """
    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
    lookup = {symbol: position for position, symbol in enumerate(charset, start=1)} | {'?': 0}

    codes = np.fromiter(
        (lookup.get(symbol, 0) for symbol in sequence[:max_sequence_length]),
        dtype=int,
    )
    label = np.zeros(max_sequence_length, dtype=int)
    label[:codes.size] = codes

    return label
|
41 |
+
|
42 |
+
|
43 |
+
def smiles_to_onehot(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):
    """One-hot encode a SMILES string by delegating to `sequence_to_onehot`."""
    return sequence_to_onehot(
        sequence=smiles,
        charset=smiles_charset,
        max_sequence_length=max_sequence_length,
    )
|
50 |
+
|
51 |
+
|
52 |
+
def smiles_to_label(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):
    """Label-encode a SMILES string by delegating to `sequence_to_label`."""
    return sequence_to_label(
        sequence=smiles,
        charset=smiles_charset,
        max_sequence_length=max_sequence_length,
    )
|
58 |
+
|
59 |
+
|
60 |
+
def fasta_to_onehot(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):
    """One-hot encode a FASTA sequence by delegating to `sequence_to_onehot`."""
    return sequence_to_onehot(
        sequence=fasta,
        charset=fasta_charset,
        max_sequence_length=max_sequence_length,
    )
|
66 |
+
|
67 |
+
|
68 |
+
def fasta_to_label(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):
    """Label-encode a FASTA sequence by delegating to `sequence_to_label`."""
    return sequence_to_label(
        sequence=fasta,
        charset=fasta_charset,
        max_sequence_length=max_sequence_length,
    )
|
74 |
+
|
75 |
+
|
76 |
+
def one_of_k_encoding(x, allowable_set):
    """
    Return a list of booleans, one per element of `allowable_set`, marking equality with `x`.

    Raises:
        Exception: if `x` is not in `allowable_set`.
    """
    if x in allowable_set:
        return [x == candidate for candidate in allowable_set]
    raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set))
|
80 |
+
|
81 |
+
|
82 |
+
def one_of_k_encoding_unk(x, allowable_set):
    """Like a one-of-k encoding, but inputs not in the allowable set map to its last element."""
    target = x if x in allowable_set else allowable_set[-1]
    return [target == candidate for candidate in allowable_set]
|
deepscreen/data/featurizers/chem.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Mainly adapted from MolMap:
|
3 |
+
https://github.com/shenwanxiang/bidd-molmap/tree/master/molmap/feature/fingerprint
|
4 |
+
"""
|
5 |
+
import numpy as np
|
6 |
+
from rdkit import Chem, DataStructs
|
7 |
+
from rdkit.Chem import AllChem
|
8 |
+
from rdkit.Chem.Fingerprints import FingerprintMols
|
9 |
+
from rdkit.Chem.rdReducedGraphs import GetErGFingerprint
|
10 |
+
|
11 |
+
from deepscreen import get_logger
|
12 |
+
|
13 |
+
log = get_logger(__name__)
|
14 |
+
|
15 |
+
|
16 |
+
def smiles_to_erg(smiles):
    """
    Convert a SMILES string to its ErG fingerprint as a boolean array.

    Returns None (after logging a warning) if any step of the conversion raises.
    """
    try:
        fingerprint = GetErGFingerprint(Chem.MolFromSmiles(smiles))
        return np.array(fingerprint, dtype=bool)
    except Exception as e:
        log.warning(f"Failed to convert SMILES ({smiles}) to ErGFP due to {str(e)}")
    return None
|
24 |
+
|
25 |
+
|
26 |
+
def smiles_to_morgan(smiles, radius=2, n_bits=1024):
    """
    Convert a SMILES string to a Morgan fingerprint array.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius passed to RDKit.
        n_bits: Number of bits passed to RDKit as `nBits`.

    Returns:
        The fingerprint as a numpy array, or None (after logging a warning) if any step of
        the conversion raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        # ConvertToNumpyArray fills `features` in place
        features = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(features_vec, features)
        # the computed fingerprint was previously dropped, so callers always got None
        return features
    except Exception as e:
        log.warning(f"Failed to convert SMILES ({smiles}) to MorganFP due to {str(e)}")
        return None
|
35 |
+
|
36 |
+
|
37 |
+
def smiles_to_daylight(smiles):
    """
    Convert a SMILES string to a 2048-position 0/1 integer array from RDKit's `FingerprintMols`.

    Returns:
        Integer array of length 2048 with ones at the fingerprint's on-bits. If the conversion
        raises, a warning is logged and an all-zero array is returned instead.
    """
    num_finger = 2048
    try:
        mol = Chem.MolFromSmiles(smiles)
        bv = FingerprintMols.FingerprintMol(mol)
        features = np.zeros((num_finger,))
        # a list index keeps the "no bits set" case a no-op instead of an indexing error
        features[list(bv.GetOnBits())] = 1
    except Exception as e:
        # best-effort: keep the all-zero fallback, but report why through the module logger
        log.warning(f'RDKit could not find this SMILES: {smiles} convert to all 0 features due to {str(e)}')
        features = np.zeros((num_finger,))
    return features.astype(int)
|
deepscreen/data/featurizers/fcs.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from importlib import resources
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
from subword_nmt.apply_bpe import BPE
|
6 |
+
import codecs
|
7 |
+
|
8 |
+
# Protein sub-word vocabulary: BPE merge codes plus the sub-word -> index table.
vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/protein_codes_uniprot.txt')
# Close the codes file once the encoder is built instead of leaking the handle.
# NOTE(review): relies on BPE reading all merge codes inside its constructor — confirm
# against the installed subword_nmt version.
with codecs.open(vocab_path) as bpe_codes_protein:
    protein_bpe = BPE(bpe_codes_protein, merges=-1, separator='')

sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_uniprot.csv')
sub_csv = pd.read_csv(sub_csv_path)
idx2word_protein = sub_csv['index'].values
# each sub-word maps to its row position in the CSV
words2idx_protein = dict(zip(idx2word_protein, range(0, len(idx2word_protein))))

# Drug sub-word vocabulary, built the same way.
vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/drug_codes_chembl.txt')
with codecs.open(vocab_path) as bpe_codes_drug:
    drug_bpe = BPE(bpe_codes_drug, merges=-1, separator='')

sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_chembl.csv')
sub_csv = pd.read_csv(sub_csv_path)
idx2word_drug = sub_csv['index'].values
words2idx_drug = dict(zip(idx2word_drug, range(0, len(idx2word_drug))))
|
25 |
+
|
26 |
+
|
27 |
+
def protein_to_embedding(x, max_sequence_length):
    """
    Encode a protein sequence as fixed-length sub-word indices plus a padding mask.

    Args:
        x: Sequence string, split into sub-words by `protein_bpe`.
        max_sequence_length: Length of both returned arrays.

    Returns:
        Tuple `(indices, input_mask)`: `indices` is zero-padded or truncated to
        `max_sequence_length`; `input_mask` holds 1 for real tokens and 0 for padding.
    """
    tokens = protein_bpe.process_line(x).split()
    try:
        # dtype=int keeps an empty token list from producing a float array
        token_ids = np.asarray([words2idx_protein[token] for token in tokens], dtype=int)
    except KeyError:
        # a single unknown sub-word collapses the whole sequence to index 0
        token_ids = np.array([0])

    length = len(token_ids)

    if length < max_sequence_length:
        padding = max_sequence_length - length
        indices = np.pad(token_ids, (0, padding), 'constant', constant_values=0)
        input_mask = ([1] * length) + ([0] * padding)
    else:
        indices = token_ids[:max_sequence_length]
        input_mask = [1] * max_sequence_length

    return indices, np.asarray(input_mask)
|
46 |
+
|
47 |
+
|
48 |
+
def drug_to_embedding(x, max_sequence_length):
    """
    Encode a drug string as fixed-length sub-word indices plus a padding mask.

    Args:
        x: Drug string, split into sub-words by `drug_bpe`.
        max_sequence_length: Length of both returned arrays.

    Returns:
        Tuple `(indices, input_mask)`: `indices` is zero-padded or truncated to
        `max_sequence_length`; `input_mask` holds 1 for real tokens and 0 for padding.
    """
    tokens = drug_bpe.process_line(x).split()
    try:
        # dtype=int keeps an empty token list from producing a float array
        token_ids = np.asarray([words2idx_drug[token] for token in tokens], dtype=int)
    except KeyError:
        # a single unknown sub-word collapses the whole sequence to index 0
        token_ids = np.array([0])

    length = len(token_ids)

    if length < max_sequence_length:
        padding = max_sequence_length - length
        indices = np.pad(token_ids, (0, padding), 'constant', constant_values=0)
        input_mask = ([1] * length) + ([0] * padding)
    else:
        indices = token_ids[:max_sequence_length]
        input_mask = [1] * max_sequence_length

    return indices, np.asarray(input_mask)
|
deepscreen/data/featurizers/fingerprint/__init__.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Literal
|
2 |
+
|
3 |
+
from .atompairs import GetAtomPairFPs
|
4 |
+
from .avalonfp import GetAvalonFPs
|
5 |
+
from .rdkitfp import GetRDkitFPs
|
6 |
+
from .morganfp import GetMorganFPs
|
7 |
+
from .estatefp import GetEstateFPs
|
8 |
+
from .maccskeys import GetMACCSFPs
|
9 |
+
from .pharmErGfp import GetPharmacoErGFPs
|
10 |
+
from .pharmPointfp import GetPharmacoPFPs
|
11 |
+
from .pubchemfp import GetPubChemFPs
|
12 |
+
from .torsions import GetTorsionFPs
|
13 |
+
from .mhfp6 import GetMHFP6
|
14 |
+
# from .map4 import GetMAP4
|
15 |
+
from rdkit import Chem
|
16 |
+
|
17 |
+
from deepscreen import get_logger
|
18 |
+
|
19 |
+
log = get_logger(__name__)
|
20 |
+
|
21 |
+
# Fingerprint name -> function computing that fingerprint from an RDKit mol.
FP_MAP = dict(
    MorganFP=GetMorganFPs,
    RDkitFP=GetRDkitFPs,
    AtomPairFP=GetAtomPairFPs,
    TorsionFP=GetTorsionFPs,
    AvalonFP=GetAvalonFPs,
    EstateFP=GetEstateFPs,
    MACCSFP=GetMACCSFPs,
    PharmacoErGFP=GetPharmacoErGFPs,
    PharmacoPFP=GetPharmacoPFPs,
    PubChemFP=GetPubChemFPs,
    MHFP6=GetMHFP6,
    # MAP4=GetMAP4,  (disabled together with its import)
)
|
35 |
+
|
36 |
+
|
37 |
+
def smiles_to_fingerprint(smiles, fingerprint: Literal[tuple(FP_MAP.keys())], **kwargs):
    """
    Compute the named fingerprint for a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        fingerprint: Key of `FP_MAP` selecting the fingerprint function.
        **kwargs: Forwarded to the selected fingerprint function.

    Returns:
        Whatever the selected function returns, or None (after logging a warning) if the
        SMILES cannot be parsed or the function raises.

    Raises:
        KeyError: if `fingerprint` is not a key of `FP_MAP`.
    """
    func = FP_MAP[fingerprint]
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # fail here with a clear reason rather than relying on the featurizer to
            # choke on None
            raise ValueError('RDKit could not parse the SMILES string')
        return func(mol, **kwargs)
    except Exception as e:
        log.warning(f"Failed to convert SMILES ({smiles}) to {fingerprint} due to {str(e)}")
        return None
|
deepscreen/data/featurizers/fingerprint/atompairs.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from rdkit.Chem.AtomPairs import Pairs
|
2 |
+
from rdkit.Chem import DataStructs
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
_type = 'topological-based'
|
6 |
+
|
7 |
+
|
8 |
+
def GetAtomPairFPs(mol, nBits=2048, binary=True):
    '''
    Hashed atom-pair fingerprint of `mol` as a numpy array.

    The array is boolean when `binary` is true and int8 otherwise.
    '''
    dtype = np.bool_ if binary else np.int8
    fingerprint = Pairs.GetHashedAtomPairFingerprint(mol, nBits=nBits)
    # ConvertToNumpyArray fills the array in place
    arr = np.zeros((0,), dtype=dtype)
    DataStructs.ConvertToNumpyArray(fingerprint, arr)
    return arr
|
deepscreen/data/featurizers/fingerprint/avalonfp.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from rdkit.Chem import DataStructs
|
2 |
+
from rdkit.Avalon.pyAvalonTools import GetAvalonFP as GAFP
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
_type = 'topological-based'
|
6 |
+
|
7 |
+
|
8 |
+
def GetAvalonFPs(mol, nBits=2048):
    '''
    Avalon fingerprint of `mol` as a boolean numpy array.

    Reference: https://pubs.acs.org/doi/pdf/10.1021/ci050413p
    '''
    # ConvertToNumpyArray fills the array in place
    arr = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(GAFP(mol, nBits=nBits), arr)
    return arr
|
deepscreen/data/featurizers/fingerprint/estatefp.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from rdkit.Chem.EState import Fingerprinter
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
_type = 'Estate-based'
|
5 |
+
|
6 |
+
|
7 |
+
def GetEstateFPs(mol):
    '''
    EState fingerprint of `mol` (79 bits) as a boolean array.

    Only the first element returned by `Fingerprinter.FingerprintMol` is used.
    '''
    first_result = Fingerprinter.FingerprintMol(mol)[0]
    return first_result.astype(np.bool_)
|
deepscreen/data/featurizers/fingerprint/maccskeys.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from rdkit.Chem import AllChem
|
2 |
+
from rdkit.Chem import DataStructs
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
import os
|
6 |
+
|
7 |
+
_type = 'SMARTS-based'
|
8 |
+
|
9 |
+
file_path = os.path.dirname(__file__)
|
10 |
+
|
11 |
+
|
12 |
+
def GetMACCSFPs(mol):
    '''
    MACCS keys fingerprint of `mol` (166 keys) as a boolean numpy array.

    NOTE(review): RDKit's MACCS bit vector is reported to hold 167 positions with bit 0
    unused — confirm the actual array length.
    '''
    # ConvertToNumpyArray fills the array in place
    arr = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(AllChem.GetMACCSKeysFingerprint(mol), arr)
    return arr
|
22 |
+
|
23 |
+
|
24 |
+
def GetMACCSFPInfos():
    """Read `maccskeys.xlsx`, located next to this module, into a DataFrame."""
    workbook_path = os.path.join(file_path, 'maccskeys.xlsx')
    return pd.read_excel(workbook_path)
|
deepscreen/data/featurizers/fingerprint/maccskeys.xlsx
ADDED
Binary file (14 kB). View file
|
|
deepscreen/data/featurizers/fingerprint/map4.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
MinHashed Atom-pair Fingerprint, MAP
|
3 |
+
Original paper: Capecchi, Alice, Daniel Probst, and Jean-Louis Reymond. "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome." Journal of Cheminformatics 12.1 (2020): 1-15. Original code: https://github.com/reymond-group/map4 (thanks to the authors for their original work).
|
4 |
+
|
5 |
+
A small bug is fixed: https://github.com/reymond-group/map4/issues/6
|
6 |
+
"""
|
7 |
+
|
8 |
+
_type = 'topological-based'
|
9 |
+
|
10 |
+
import itertools
|
11 |
+
from collections import defaultdict
|
12 |
+
|
13 |
+
import tmap as tm
|
14 |
+
from mhfp.encoder import MHFPEncoder
|
15 |
+
from rdkit import Chem
|
16 |
+
from rdkit.Chem import rdmolops
|
17 |
+
from rdkit.Chem.rdmolops import GetDistanceMatrix
|
18 |
+
|
19 |
+
|
20 |
+
def to_smiles(mol):
    """Return the canonical, non-isomeric SMILES string of `mol`."""
    smiles = Chem.MolToSmiles(mol, canonical=True, isomericSmiles=False)
    return smiles
|
22 |
+
|
23 |
+
|
24 |
+
class MAP4Calculator:
    """MinHashed atom-pair (MAP4) fingerprint calculator."""

    def __init__(self, dimensions=2048, radius=2, is_counted=False, is_folded=False, fold_dimensions=2048):
        """
        Store the settings and pick the encoder.

        A folded fingerprint uses `MHFPEncoder`; otherwise `tm.Minhash` is used.
        """
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded
        self.fold_dimensions = fold_dimensions
        self.encoder = MHFPEncoder(dimensions) if self.is_folded else tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mol -- rdkit mol object
        Returns:
            tmap VectorUint -- minhashed fingerprint (the folded fingerprint if `is_folded`)
        """
        shingles = self._calculate(mol)
        if not self.is_folded:
            return self.encoder.from_string_array(shingles)
        return self._fold(shingles)

    def calculate_many(self, mols):
        """Calculates the atom pair minhashed fingerprint for several molecules

        Arguments:
            mols -- list of mols
        Returns:
            list of tmap VectorUint -- minhashed fingerprints list
            (a list of folded fingerprints if `is_folded`)
        """
        shingles_per_mol = [self._calculate(mol) for mol in mols]
        if not self.is_folded:
            return self.encoder.batch_from_string_array(shingles_per_mol)
        return [self._fold(shingles) for shingles in shingles_per_mol]

    def _calculate(self, mol):
        """Return the atom-pair shingles of `mol`."""
        environments = self._get_atom_envs(mol)
        return self._all_pairs(mol, environments)

    def _fold(self, pairs):
        """Hash the set of shingles and fold the result to `fold_dimensions`."""
        hashed = self.encoder.hash(set(pairs))
        return self.encoder.fold(hashed, self.fold_dimensions)

    def _get_atom_envs(self, mol):
        """Map each atom index to its environment SMILES for radii 1..`self.radius`."""
        atoms_env = {}
        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            for radius in range(1, self.radius + 1):
                env_smiles = MAP4Calculator._find_env(mol, atom_idx, radius)
                atoms_env.setdefault(atom_idx, []).append(env_smiles)
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of `radius` rooted at atom `idx`, or '' if none."""
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}

        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx not in atom_map:
            return ''
        return Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx], canonical=True, isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """Build the de-duplicated, UTF-8 encoded shingles for every atom pair and radius."""
        distances = GetDistanceMatrix(mol)
        atom_count = mol.GetNumAtoms()
        seen_counts = defaultdict(int)
        shingles = []
        for first, second in itertools.combinations(range(atom_count), 2):
            dist = str(int(distances[first][second]))

            for level in range(self.radius):
                # the two environments are sorted so the shingle is order-independent
                lower, upper = sorted([atoms_env[first][level], atoms_env[second][level]])
                shingle = '{}|{}|{}'.format(lower, dist, upper)

                if self.is_counted:
                    # read the count before the suffix is appended to the key
                    seen_counts[shingle] += 1
                    shingle += '|' + str(seen_counts[shingle])

                shingles.append(shingle.encode('utf-8'))
        return list(set(shingles))
|
116 |
+
|
117 |
+
|
118 |
+
def GetMAP4(mol, nBits=2048, radius=2, fold_dimensions=None):
    """
    MAP4: radius=2

    Compute the folded MAP4 fingerprint of `mol` as a boolean array.

    Args:
        mol: RDKit mol object.
        nBits: Passed to `MAP4Calculator` as `dimensions`.
        radius: Maximum environment radius.
        fold_dimensions: Length to fold to; defaults to `nBits` when None.
    """
    # identity check: None is the "not given" marker
    if fold_dimensions is None:
        fold_dimensions = nBits

    calc = MAP4Calculator(dimensions=nBits, radius=radius, is_counted=False, is_folded=True,
                          fold_dimensions=fold_dimensions)

    arr = calc.calculate(mol)

    return arr.astype(bool)
|
deepscreen/data/featurizers/fingerprint/mhfp6.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Probst, Daniel, and Jean-Louis Reymond. "A probabilistic molecular fingerprint for big data settings." Journal of Cheminformatics 10.1 (2018): 66.
|
3 |
+
|
4 |
+
Original code: https://github.com/reymond-group/mhfp
|
5 |
+
|
6 |
+
"""
|
7 |
+
|
8 |
+
from mhfp.encoder import MHFPEncoder
|
9 |
+
|
10 |
+
|
11 |
+
def GetMHFP6(mol, nBits=2048, radius=3):
    """
    MHFP6: radius=3

    Encode `mol` with an `MHFPEncoder` of `nBits` permutations, fold the hash values to
    `nBits` and return the result as a boolean array.
    """
    mhfp_encoder = MHFPEncoder(n_permutations=nBits)
    hashes = mhfp_encoder.encode_mol(mol, radius=radius, rings=True, kekulize=True, min_radius=1)
    folded = mhfp_encoder.fold(hashes, nBits)
    return folded.astype(bool)
|
deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AtomType NDonor [N&!H0&v3,N&!H0&+1&v4,n&H1&+0]
|
2 |
+
AtomType ChalcDonor [O,S;H1;+0]
|
3 |
+
DefineFeature SingleAtomDonor [{NDonor},{ChalcDonor},!$([D1]-[C;D3]=[O,S,N])]
|
4 |
+
Family Donor
|
5 |
+
Weights 1
|
6 |
+
EndFeature
|
7 |
+
|
8 |
+
AtomType NAcceptor [$([N&v3;H1,H2]-[!$(*=[O,N,P,S])])]
|
9 |
+
Atomtype NAcceptor [$([N;v3;H0])]
|
10 |
+
AtomType NAcceptor [$([n;+0])]
|
11 |
+
AtomType ChalcAcceptor [$([O,S;H1;v2]-[!$(*=[O,N,P,S])])]
|
12 |
+
AtomType ChalcAcceptor [O,S;H0;v2]
|
13 |
+
Atomtype ChalcAcceptor [O,S;-]
|
14 |
+
Atomtype ChalcAcceptor [o,s;+0]
|
15 |
+
AtomType HalogenAcceptor [F]
|
16 |
+
DefineFeature SingleAtomAcceptor [{NAcceptor},{ChalcAcceptor},{HalogenAcceptor}]
|
17 |
+
Family Acceptor
|
18 |
+
Weights 1
|
19 |
+
EndFeature
|
20 |
+
|
21 |
+
# this one is delightfully easy:
|
22 |
+
DefineFeature AcidicGroup [C,S](=[O,S,P])-[O;H1,H0&-1]
|
23 |
+
Family NegIonizable
|
24 |
+
Weights 1.0,1.0,1.0
|
25 |
+
EndFeature
|
26 |
+
|
27 |
+
AtomType CarbonOrArom_NonCarbonyl [$([C,a]);!$([C,a](=O))]
|
28 |
+
AtomType BasicNH2 [$([N;H2&+0][{CarbonOrArom_NonCarbonyl}])]
|
29 |
+
AtomType BasicNH1 [$([N;H1&+0]([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
|
30 |
+
AtomType BasicNH0 [$([N;H0&+0]([{CarbonOrArom_NonCarbonyl}])([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
|
31 |
+
AtomType BasicNakedN [N,n;X2;+0]
|
32 |
+
DefineFeature BasicGroup [{BasicNH2},{BasicNH1},{BasicNH0},{BasicNakedN}]
|
33 |
+
Family PosIonizable
|
34 |
+
Weights 1.0
|
35 |
+
EndFeature
|
36 |
+
|
37 |
+
# aromatic rings of various sizes:
|
38 |
+
DefineFeature Arom5 a1aaaa1
|
39 |
+
Family Aromatic
|
40 |
+
Weights 1.0,1.0,1.0,1.0,1.0
|
41 |
+
EndFeature
|
42 |
+
DefineFeature Arom6 a1aaaaa1
|
43 |
+
Family Aromatic
|
44 |
+
Weights 1.0,1.0,1.0,1.0,1.0,1.0
|
45 |
+
EndFeature
|
46 |
+
DefineFeature Arom7 a1aaaaaa1
|
47 |
+
Family Aromatic
|
48 |
+
Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0
|
49 |
+
EndFeature
|
50 |
+
DefineFeature Arom8 a1aaaaaaa1
|
51 |
+
Family Aromatic
|
52 |
+
Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
|
53 |
+
EndFeature
|
deepscreen/data/featurizers/fingerprint/morganfp.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from rdkit.Chem import AllChem
|
2 |
+
from rdkit.Chem import DataStructs
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
|
6 |
+
def GetMorganFPs(mol, nBits=2048, radius=2, return_bitInfo=False):
    """
    ECFP4: radius=2

    Morgan fingerprint of `mol` as a boolean numpy array. With `return_bitInfo` the bit-info
    dict filled by RDKit is returned as a second value.
    """
    bitInfo = {}
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=radius, bitInfo=bitInfo, nBits=nBits)
    # ConvertToNumpyArray fills the array in place
    arr = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(fingerprint, arr)

    if not return_bitInfo:
        return arr
    return arr, bitInfo
|
deepscreen/data/featurizers/fingerprint/pharmErGfp.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
Created on Sat Aug 17 16:54:12 2019
|
5 |
+
|
6 |
+
@author: wanxiang.shen@u.nus.edu
|
7 |
+
|
8 |
+
@calculate ErG fps, more info: https://pubs.acs.org/doi/full/10.1021/ci050457y#
|
9 |
+
"""
|
10 |
+
|
11 |
+
_type = 'Pharmacophore-based'
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
from rdkit.Chem import AllChem
|
15 |
+
|
16 |
+
## get info from : https://github.com/rdkit/rdkit/blob/d41752d558bf7200ab67b98cdd9e37f1bdd378de/Code/GraphMol/ReducedGraphs/ReducedGraphs.cpp
|
17 |
+
Donor = ["[N;!H0;v3,v4&+1]", "[O,S;H1;+0]", "[n&H1&+0]"]
|
18 |
+
|
19 |
+
Acceptor = ["[O,S;H1;v2;!$(*-*=[O,N,P,S])]", "[O;H0;v2]", "[O,S;v1;-]",
|
20 |
+
"[N;v3;!$(N-*=[O,N,P,S])]", "[n&H0&+0]", "[o;+0;!$([o]:n);!$([o]:c:n)]"]
|
21 |
+
|
22 |
+
Positive = ["[#7;+]", "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
|
23 |
+
"[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
|
24 |
+
"[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]"]
|
25 |
+
|
26 |
+
Negative = ["[C,S](=[O,S,P])-[O;H1,-1]"]
|
27 |
+
|
28 |
+
Hydrophobic = ["[C;D3,D4](-[CH3])-[CH3]", "[S;D2](-C)-C"]
|
29 |
+
|
30 |
+
Aromatic = ["a"]
|
31 |
+
|
32 |
+
PROPERTY_KEY = ["Donor", "Acceptor", "Positive", "Negative", "Hydrophobic", "Aromatic"]
|
33 |
+
|
34 |
+
|
35 |
+
def GetPharmacoErGFPs(mol, fuzzIncrement=0.3, maxPath=21, binary=True, return_bitInfo=False):
    '''
    ErG fingerprint of `mol`, see https://pubs.acs.org/doi/full/10.1021/ci050457y#

    The vector size is size(v) = (n(n + 1)/2) * (maxDist - minDist + 1), where n is the
    number of property types (`PROPERTY_KEY`) and minDist is fixed to 1 here.

    The result is float32, or boolean when `binary` is true. With `return_bitInfo` a list of
    (property, property, path) triplets is returned as a second value.
    '''
    minPath = 1

    fingerprint = AllChem.GetErGFingerprint(mol, fuzzIncrement=fuzzIncrement, maxPath=maxPath, minPath=minPath)
    arr = fingerprint.astype(np.float32)
    if binary:
        arr = arr.astype(np.bool_)

    if not return_bitInfo:
        return arr

    # one triplet per unordered property pair and path length, in nested-loop order
    bitInfo = [
        (first, second, path)
        for offset, first in enumerate(PROPERTY_KEY)
        for second in PROPERTY_KEY[offset:]
        for path in range(minPath, maxPath + 1)
    ]
    return arr, bitInfo
|
deepscreen/data/featurizers/fingerprint/pharmPointfp.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
Created on Sat Aug 17 16:54:12 2019
|
5 |
+
|
6 |
+
@author: wanxiang.shen@u.nus.edu
|
7 |
+
|
8 |
+
Combining a set of chemical features with the 2D (topological) distances between them gives a 2D pharmacophore. When the distances are binned, unique integer ids can be assigned to each of these pharmacophores and they can be stored in a fingerprint. Details of the encoding are in: https://www.rdkit.org/docs/RDKit_Book.html#ph4-figure
|
9 |
+
"""
|
10 |
+
|
11 |
+
_type = 'Pharmacophore-based'
|
12 |
+
|
13 |
+
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
|
14 |
+
from rdkit.Chem.Pharm2D import Generate
|
15 |
+
from rdkit.Chem import DataStructs
|
16 |
+
from rdkit.Chem import ChemicalFeatures
|
17 |
+
|
18 |
+
import numpy as np
|
19 |
+
import os
|
20 |
+
|
21 |
+
fdef = os.path.join(os.path.dirname(__file__), 'mnimalfatures.fdef')
|
22 |
+
featFactory = ChemicalFeatures.BuildFeatureFactory(fdef)
|
23 |
+
|
24 |
+
|
25 |
+
def GetPharmacoPFPs(mol,
                    bins=None,
                    minPointCount=2,
                    maxPointCount=2,
                    return_bitInfo=False):
    '''
    Compute a 2D pharmacophore fingerprint for a molecule.

    A SigFactory is configured from the module-level ``featFactory`` and the
    given distance bins, then ``Generate.Gen2DFingerprint`` is run on ``mol``.

    Parameters
    ----------
    mol : molecule object handed to ``Generate.Gen2DFingerprint``.
    bins : sequence of (lower, upper) distance bins passed to
        ``SigFactory.SetBins``. When None (the default) the bins are
        ``[(i, i + 1) for i in range(20)]``.
    minPointCount, maxPointCount : forwarded unchanged to ``SigFactory``.
        Note: maxPointCount=3 is slow; keep maxPointCount=2 for
        large-scale computation.
    return_bitInfo : when True, also return a description for each bit.

    Returns
    -------
    arr : boolean numpy array built from the elements of the fingerprint.
    (arr, description) : returned instead when ``return_bitInfo`` is True;
        ``description[i]`` is ``SigFactory.GetBitDescription(i)``.
    '''
    if bins is None:
        # Built per call so the default is never a shared mutable object.
        bins = [(i, i + 1) for i in range(20)]

    sig_factory = SigFactory(featFactory,
                             trianglePruneBins=False,
                             minPointCount=minPointCount,
                             maxPointCount=maxPointCount)
    sig_factory.SetBins(bins)
    # Init() must follow SetBins(): the original code finalises the factory
    # only after the bins are in place.
    sig_factory.Init()

    res = Generate.Gen2DFingerprint(mol, sig_factory)
    arr = np.array(list(res)).astype(np.bool_)
    if return_bitInfo:
        description = [sig_factory.GetBitDescription(i)
                       for i in range(len(res))]
        return arr, description

    return arr
|
53 |
+
|
54 |
+
|
55 |
+
if __name__ == '__main__':
    # Smoke run: fingerprint a single example molecule with explicit settings.
    from rdkit import Chem

    smiles = 'CC#CC(=O)NC1=NC=C2C(=C1)C(=NC=N2)NC3=CC(=C(C=C3)F)Cl'
    unit_bins = [(i, i + 1) for i in range(20)]

    mol = Chem.MolFromSmiles(smiles)
    a = GetPharmacoPFPs(
        mol,
        bins=unit_bins,
        minPointCount=2,
        maxPointCount=2,
    )
|
deepscreen/data/featurizers/fingerprint/pubchemfp.py
ADDED
@@ -0,0 +1,1731 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
Created on Sun Aug 25 20:29:36 2019
|
5 |
+
|
6 |
+
@author: charleshen
|
7 |
+
|
8 |
+
@Note: The code is copied from PyBioMed, with a minor repair
|
9 |
+
|
10 |
+
https://www.ncbi.nlm.nih.gov/pubmed/29556758
|
11 |
+
|
12 |
+
these are SMARTS patterns corresponding to the PubChem fingerprints
|
13 |
+
https://astro.temple.edu/~tua87106/list_fingerprints.pdf
|
14 |
+
ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
|
15 |
+
|
16 |
+
"""
|
17 |
+
|
18 |
+
_type = 'SMARTS-based'
|
19 |
+
|
20 |
+
import numpy as np
|
21 |
+
from rdkit import Chem
|
22 |
+
from rdkit import DataStructs
|
23 |
+
import os
|
24 |
+
import pandas as pd
|
25 |
+
|
26 |
+
smartsPatts = {
|
27 |
+
1: ('[H]', 3), # 1-115
|
28 |
+
2: ('[H]', 7),
|
29 |
+
3: ('[H]', 15),
|
30 |
+
4: ('[H]', 31),
|
31 |
+
5: ('[Li]', 0),
|
32 |
+
6: ('[Li]', 1),
|
33 |
+
7: ('[B]', 0),
|
34 |
+
8: ('[B]', 1),
|
35 |
+
9: ('[B]', 3),
|
36 |
+
10: ('[C]', 1),
|
37 |
+
11: ('[C]', 3),
|
38 |
+
12: ('[C]', 7),
|
39 |
+
13: ('[C]', 15),
|
40 |
+
14: ('[C]', 31),
|
41 |
+
15: ('[N]', 0),
|
42 |
+
16: ('[N]', 1),
|
43 |
+
17: ('[N]', 3),
|
44 |
+
18: ('[N]', 7),
|
45 |
+
19: ('[O]', 0),
|
46 |
+
20: ('[O]', 1),
|
47 |
+
21: ('[O]', 3),
|
48 |
+
22: ('[O]', 7),
|
49 |
+
23: ('[O]', 15),
|
50 |
+
24: ('[F]', 0),
|
51 |
+
25: ('[F]', 1),
|
52 |
+
26: ('[F]', 3),
|
53 |
+
27: ('[Na]', 0),
|
54 |
+
28: ('[Na]', 1),
|
55 |
+
29: ('[Si]', 0),
|
56 |
+
30: ('[Si]', 1),
|
57 |
+
31: ('[P]', 0),
|
58 |
+
32: ('[P]', 1),
|
59 |
+
33: ('[P]', 3),
|
60 |
+
34: ('[S]', 0),
|
61 |
+
35: ('[S]', 1),
|
62 |
+
36: ('[S]', 3),
|
63 |
+
37: ('[S]', 7),
|
64 |
+
38: ('[Cl]', 0),
|
65 |
+
39: ('[Cl]', 1),
|
66 |
+
40: ('[Cl]', 3),
|
67 |
+
41: ('[Cl]', 7),
|
68 |
+
42: ('[K]', 0),
|
69 |
+
43: ('[K]', 1),
|
70 |
+
44: ('[Br]', 0),
|
71 |
+
45: ('[Br]', 1),
|
72 |
+
46: ('[Br]', 3),
|
73 |
+
47: ('[I]', 0),
|
74 |
+
48: ('[I]', 1),
|
75 |
+
49: ('[I]', 3),
|
76 |
+
50: ('[Be]', 0),
|
77 |
+
51: ('[Mg]', 0),
|
78 |
+
52: ('[Al]', 0),
|
79 |
+
53: ('[Ca]', 0),
|
80 |
+
54: ('[Sc]', 0),
|
81 |
+
55: ('[Ti]', 0),
|
82 |
+
56: ('[V]', 0),
|
83 |
+
57: ('[Cr]', 0),
|
84 |
+
58: ('[Mn]', 0),
|
85 |
+
59: ('[Fe]', 0),
|
86 |
+
60: ('[CO]', 0),
|
87 |
+
61: ('[Ni]', 0),
|
88 |
+
62: ('[Cu]', 0),
|
89 |
+
63: ('[Zn]', 0),
|
90 |
+
64: ('[Ga]', 0),
|
91 |
+
65: ('[Ge]', 0),
|
92 |
+
66: ('[As]', 0),
|
93 |
+
67: ('[Se]', 0),
|
94 |
+
68: ('[Kr]', 0),
|
95 |
+
69: ('[Rb]', 0),
|
96 |
+
70: ('[Sr]', 0),
|
97 |
+
71: ('[Y]', 0),
|
98 |
+
72: ('[Zr]', 0),
|
99 |
+
73: ('[Nb]', 0),
|
100 |
+
74: ('[Mo]', 0),
|
101 |
+
75: ('[Ru]', 0),
|
102 |
+
76: ('[Rh]', 0),
|
103 |
+
77: ('[Pd]', 0),
|
104 |
+
78: ('[Ag]', 0),
|
105 |
+
79: ('[Cd]', 0),
|
106 |
+
80: ('[In]', 0),
|
107 |
+
81: ('[Sn]', 0),
|
108 |
+
82: ('[Sb]', 0),
|
109 |
+
83: ('[Te]', 0),
|
110 |
+
84: ('[Xe]', 0),
|
111 |
+
85: ('[Cs]', 0),
|
112 |
+
86: ('[Ba]', 0),
|
113 |
+
87: ('[Lu]', 0),
|
114 |
+
88: ('[Hf]', 0),
|
115 |
+
89: ('[Ta]', 0),
|
116 |
+
90: ('[W]', 0),
|
117 |
+
91: ('[Re]', 0),
|
118 |
+
92: ('[Os]', 0),
|
119 |
+
93: ('[Ir]', 0),
|
120 |
+
94: ('[Pt]', 0),
|
121 |
+
95: ('[Au]', 0),
|
122 |
+
96: ('[Hg]', 0),
|
123 |
+
97: ('[Tl]', 0),
|
124 |
+
98: ('[Pb]', 0),
|
125 |
+
99: ('[Bi]', 0),
|
126 |
+
100: ('[La]', 0),
|
127 |
+
101: ('[Ce]', 0),
|
128 |
+
102: ('[Pr]', 0),
|
129 |
+
103: ('[Nd]', 0),
|
130 |
+
104: ('[Pm]', 0),
|
131 |
+
105: ('[Sm]', 0),
|
132 |
+
106: ('[Eu]', 0),
|
133 |
+
107: ('[Gd]', 0),
|
134 |
+
108: ('[Tb]', 0),
|
135 |
+
109: ('[Dy]', 0),
|
136 |
+
110: ('[Ho]', 0),
|
137 |
+
111: ('[Er]', 0),
|
138 |
+
112: ('[Tm]', 0),
|
139 |
+
113: ('[Yb]', 0),
|
140 |
+
114: ('[Tc]', 0),
|
141 |
+
115: ('[U]', 0),
|
142 |
+
116: ('[Li&!H0]', 0), # 264-881
|
143 |
+
117: ('[Li]~[Li]', 0),
|
144 |
+
118: ('[Li]~[#5]', 0),
|
145 |
+
119: ('[Li]~[#6]', 0),
|
146 |
+
120: ('[Li]~[#8]', 0),
|
147 |
+
121: ('[Li]~[F]', 0),
|
148 |
+
122: ('[Li]~[#15]', 0),
|
149 |
+
123: ('[Li]~[#16]', 0),
|
150 |
+
124: ('[Li]~[Cl]', 0),
|
151 |
+
125: ('[#5&!H0]', 0),
|
152 |
+
126: ('[#5]~[#5]', 0),
|
153 |
+
127: ('[#5]~[#6]', 0),
|
154 |
+
128: ('[#5]~[#7]', 0),
|
155 |
+
129: ('[#5]~[#8]', 0),
|
156 |
+
130: ('[#5]~[F]', 0),
|
157 |
+
131: ('[#5]~[#14]', 0),
|
158 |
+
132: ('[#5]~[#15]', 0),
|
159 |
+
133: ('[#5]~[#16]', 0),
|
160 |
+
134: ('[#5]~[Cl]', 0),
|
161 |
+
135: ('[#5]~[Br]', 0),
|
162 |
+
136: ('[#6&!H0]', 0),
|
163 |
+
137: ('[#6]~[#6]', 0),
|
164 |
+
138: ('[#6]~[#7]', 0),
|
165 |
+
139: ('[#6]~[#8]', 0),
|
166 |
+
140: ('[#6]~[F]', 0),
|
167 |
+
141: ('[#6]~[Na]', 0),
|
168 |
+
142: ('[#6]~[Mg]', 0),
|
169 |
+
143: ('[#6]~[Al]', 0),
|
170 |
+
144: ('[#6]~[#14]', 0),
|
171 |
+
145: ('[#6]~[#15]', 0),
|
172 |
+
146: ('[#6]~[#16]', 0),
|
173 |
+
147: ('[#6]~[Cl]', 0),
|
174 |
+
148: ('[#6]~[#33]', 0),
|
175 |
+
149: ('[#6]~[#34]', 0),
|
176 |
+
150: ('[#6]~[Br]', 0),
|
177 |
+
151: ('[#6]~[I]', 0),
|
178 |
+
152: ('[#7&!H0]', 0),
|
179 |
+
153: ('[#7]~[#7]', 0),
|
180 |
+
154: ('[#7]~[#8]', 0),
|
181 |
+
155: ('[#7]~[F]', 0),
|
182 |
+
156: ('[#7]~[#14]', 0),
|
183 |
+
157: ('[#7]~[#15]', 0),
|
184 |
+
158: ('[#7]~[#16]', 0),
|
185 |
+
159: ('[#7]~[Cl]', 0),
|
186 |
+
160: ('[#7]~[Br]', 0),
|
187 |
+
161: ('[#8&!H0]', 0),
|
188 |
+
162: ('[#8]~[#8]', 0),
|
189 |
+
163: ('[#8]~[Mg]', 0),
|
190 |
+
164: ('[#8]~[Na]', 0),
|
191 |
+
165: ('[#8]~[Al]', 0),
|
192 |
+
166: ('[#8]~[#14]', 0),
|
193 |
+
167: ('[#8]~[#15]', 0),
|
194 |
+
168: ('[#8]~[K]', 0),
|
195 |
+
169: ('[F]~[#15]', 0),
|
196 |
+
170: ('[F]~[#16]', 0),
|
197 |
+
171: ('[Al&!H0]', 0),
|
198 |
+
172: ('[Al]~[Cl]', 0),
|
199 |
+
173: ('[#14&!H0]', 0),
|
200 |
+
174: ('[#14]~[#14]', 0),
|
201 |
+
175: ('[#14]~[Cl]', 0),
|
202 |
+
176: ('[#15&!H0]', 0),
|
203 |
+
177: ('[#15]~[#15]', 0),
|
204 |
+
178: ('[#33&!H0]', 0),
|
205 |
+
179: ('[#33]~[#33]', 0),
|
206 |
+
180: ('[#6](~Br)(~[#6])', 0),
|
207 |
+
181: ('[#6](~Br)(~[#6])(~[#6])', 0),
|
208 |
+
182: ('[#6&!H0]~[Br]', 0),
|
209 |
+
183: ('[#6](~[Br])(:[c])', 0),
|
210 |
+
184: ('[#6](~[Br])(:[n])', 0),
|
211 |
+
185: ('[#6](~[#6])(~[#6])', 0),
|
212 |
+
186: ('[#6](~[#6])(~[#6])(~[#6])', 0),
|
213 |
+
187: ('[#6](~[#6])(~[#6])(~[#6])(~[#6])', 0),
|
214 |
+
188: ('[#6H1](~[#6])(~[#6])(~[#6])', 0),
|
215 |
+
189: ('[#6](~[#6])(~[#6])(~[#6])(~[#7])', 0),
|
216 |
+
190: ('[#6](~[#6])(~[#6])(~[#6])(~[#8])', 0),
|
217 |
+
191: ('[#6H1](~[#6])(~[#6])(~[#7])', 0),
|
218 |
+
192: ('[#6H1](~[#6])(~[#6])(~[#8])', 0),
|
219 |
+
193: ('[#6](~[#6])(~[#6])(~[#7])', 0),
|
220 |
+
194: ('[#6](~[#6])(~[#6])(~[#8])', 0),
|
221 |
+
195: ('[#6](~[#6])(~[Cl])', 0),
|
222 |
+
196: ('[#6&!H0](~[#6])(~[Cl])', 0),
|
223 |
+
197: ('[#6H,#6H2,#6H3,#6H4]~[#6]', 0),
|
224 |
+
198: ('[#6&!H0](~[#6])(~[#7])', 0),
|
225 |
+
199: ('[#6&!H0](~[#6])(~[#8])', 0),
|
226 |
+
200: ('[#6H1](~[#6])(~[#8])(~[#8])', 0),
|
227 |
+
201: ('[#6&!H0](~[#6])(~[#15])', 0),
|
228 |
+
202: ('[#6&!H0](~[#6])(~[#16])', 0),
|
229 |
+
203: ('[#6](~[#6])(~[I])', 0),
|
230 |
+
204: ('[#6](~[#6])(~[#7])', 0),
|
231 |
+
205: ('[#6](~[#6])(~[#8])', 0),
|
232 |
+
206: ('[#6](~[#6])(~[#16])', 0),
|
233 |
+
207: ('[#6](~[#6])(~[#14])', 0),
|
234 |
+
208: ('[#6](~[#6])(:c)', 0),
|
235 |
+
209: ('[#6](~[#6])(:c)(:c)', 0),
|
236 |
+
210: ('[#6](~[#6])(:c)(:n)', 0),
|
237 |
+
211: ('[#6](~[#6])(:n)', 0),
|
238 |
+
212: ('[#6](~[#6])(:n)(:n)', 0),
|
239 |
+
213: ('[#6](~[Cl])(~[Cl])', 0),
|
240 |
+
214: ('[#6&!H0](~[Cl])', 0),
|
241 |
+
215: ('[#6](~[Cl])(:c)', 0),
|
242 |
+
216: ('[#6](~[F])(~[F])', 0),
|
243 |
+
217: ('[#6](~[F])(:c)', 0),
|
244 |
+
218: ('[#6&!H0](~[#7])', 0),
|
245 |
+
219: ('[#6&!H0](~[#8])', 0),
|
246 |
+
220: ('[#6&!H0](~[#8])(~[#8])', 0),
|
247 |
+
221: ('[#6&!H0](~[#16])', 0),
|
248 |
+
222: ('[#6&!H0](~[#14])', 0),
|
249 |
+
223: ('[#6&!H0]:c', 0),
|
250 |
+
224: ('[#6&!H0](:c)(:c)', 0),
|
251 |
+
225: ('[#6&!H0](:c)(:n)', 0),
|
252 |
+
226: ('[#6&!H0](:n)', 0),
|
253 |
+
227: ('[#6H3]', 0),
|
254 |
+
228: ('[#6](~[#7])(~[#7])', 0),
|
255 |
+
229: ('[#6](~[#7])(:c)', 0),
|
256 |
+
230: ('[#6](~[#7])(:c)(:c)', 0),
|
257 |
+
231: ('[#6](~[#7])(:c)(:n)', 0),
|
258 |
+
232: ('[#6](~[#7])(:n)', 0),
|
259 |
+
233: ('[#6](~[#8])(~[#8])', 0),
|
260 |
+
234: ('[#6](~[#8])(:c)', 0),
|
261 |
+
235: ('[#6](~[#8])(:c)(:c)', 0),
|
262 |
+
236: ('[#6](~[#16])(:c)', 0),
|
263 |
+
237: ('[#6](:c)(:c)', 0),
|
264 |
+
238: ('[#6](:c)(:c)(:c)', 0),
|
265 |
+
239: ('[#6](:c)(:c)(:n)', 0),
|
266 |
+
240: ('[#6](:c)(:n)', 0),
|
267 |
+
241: ('[#6](:c)(:n)(:n)', 0),
|
268 |
+
242: ('[#6](:n)(:n)', 0),
|
269 |
+
243: ('[#7](~[#6])(~[#6])', 0),
|
270 |
+
244: ('[#7](~[#6])(~[#6])(~[#6])', 0),
|
271 |
+
245: ('[#7&!H0](~[#6])(~[#6])', 0),
|
272 |
+
246: ('[#7&!H0](~[#6])', 0),
|
273 |
+
247: ('[#7&!H0](~[#6])(~[#7])', 0),
|
274 |
+
248: ('[#7](~[#6])(~[#8])', 0),
|
275 |
+
249: ('[#7](~[#6])(:c)', 0),
|
276 |
+
250: ('[#7](~[#6])(:c)(:c)', 0),
|
277 |
+
251: ('[#7&!H0](~[#7])', 0),
|
278 |
+
252: ('[#7&!H0](:c)', 0),
|
279 |
+
253: ('[#7&!H0](:c)(:c)', 0),
|
280 |
+
254: ('[#7](~[#8])(~[#8])', 0),
|
281 |
+
255: ('[#7](~[#8])(:o)', 0),
|
282 |
+
256: ('[#7](:c)(:c)', 0),
|
283 |
+
257: ('[#7](:c)(:c)(:c)', 0),
|
284 |
+
258: ('[#8](~[#6])(~[#6])', 0),
|
285 |
+
259: ('[#8&!H0](~[#6])', 0),
|
286 |
+
260: ('[#8](~[#6])(~[#15])', 0),
|
287 |
+
261: ('[#8&!H0](~[#16])', 0),
|
288 |
+
262: ('[#8](:c)(:c)', 0),
|
289 |
+
263: ('[#15](~[#6])(~[#6])', 0),
|
290 |
+
264: ('[#15](~[#8])(~[#8])', 0),
|
291 |
+
265: ('[#16](~[#6])(~[#6])', 0),
|
292 |
+
266: ('[#16&!H0](~[#6])', 0),
|
293 |
+
267: ('[#16](~[#6])(~[#8])', 0),
|
294 |
+
268: ('[#14](~[#6])(~[#6])', 0),
|
295 |
+
269: ('[#6]=,:[#6]', 0),
|
296 |
+
270: ('[#6]#[#6]', 0),
|
297 |
+
271: ('[#6]=,:[#7]', 0),
|
298 |
+
272: ('[#6]#[#7]', 0),
|
299 |
+
273: ('[#6]=,:[#8]', 0),
|
300 |
+
274: ('[#6]=,:[#16]', 0),
|
301 |
+
275: ('[#7]=,:[#7]', 0),
|
302 |
+
276: ('[#7]=,:[#8]', 0),
|
303 |
+
277: ('[#7]=,:[#15]', 0),
|
304 |
+
278: ('[#15]=,:[#8]', 0),
|
305 |
+
279: ('[#15]=,:[#15]', 0),
|
306 |
+
280: ('[#6](#[#6])(-,:[#6])', 0),
|
307 |
+
281: ('[#6&!H0](#[#6])', 0),
|
308 |
+
282: ('[#6](#[#7])(-,:[#6])', 0),
|
309 |
+
283: ('[#6](-,:[#6])(-,:[#6])(=,:[#6])', 0),
|
310 |
+
284: ('[#6](-,:[#6])(-,:[#6])(=,:[#7])', 0),
|
311 |
+
285: ('[#6](-,:[#6])(-,:[#6])(=,:[#8])', 0),
|
312 |
+
286: ('[#6](-,:[#6])([Cl])(=,:[#8])', 0),
|
313 |
+
287: ('[#6&!H0](-,:[#6])(=,:[#6])', 0),
|
314 |
+
288: ('[#6&!H0](-,:[#6])(=,:[#7])', 0),
|
315 |
+
289: ('[#6&!H0](-,:[#6])(=,:[#8])', 0),
|
316 |
+
290: ('[#6](-,:[#6])(-,:[#7])(=,:[#6])', 0),
|
317 |
+
291: ('[#6](-,:[#6])(-,:[#7])(=,:[#7])', 0),
|
318 |
+
292: ('[#6](-,:[#6])(-,:[#7])(=,:[#8])', 0),
|
319 |
+
293: ('[#6](-,:[#6])(-,:[#8])(=,:[#8])', 0),
|
320 |
+
294: ('[#6](-,:[#6])(=,:[#6])', 0),
|
321 |
+
295: ('[#6](-,:[#6])(=,:[#7])', 0),
|
322 |
+
296: ('[#6](-,:[#6])(=,:[#8])', 0),
|
323 |
+
297: ('[#6]([Cl])(=,:[#8])', 0),
|
324 |
+
298: ('[#6&!H0](-,:[#7])(=,:[#6])', 0),
|
325 |
+
299: ('[#6&!H0](=,:[#6])', 0),
|
326 |
+
300: ('[#6&!H0](=,:[#7])', 0),
|
327 |
+
301: ('[#6&!H0](=,:[#8])', 0),
|
328 |
+
302: ('[#6](-,:[#7])(=,:[#6])', 0),
|
329 |
+
303: ('[#6](-,:[#7])(=,:[#7])', 0),
|
330 |
+
304: ('[#6](-,:[#7])(=,:[#8])', 0),
|
331 |
+
305: ('[#6](-,:[#8])(=,:[#8])', 0),
|
332 |
+
306: ('[#7](-,:[#6])(=,:[#6])', 0),
|
333 |
+
307: ('[#7](-,:[#6])(=,:[#8])', 0),
|
334 |
+
308: ('[#7](-,:[#8])(=,:[#8])', 0),
|
335 |
+
309: ('[#15](-,:[#8])(=,:[#8])', 0),
|
336 |
+
310: ('[#16](-,:[#6])(=,:[#8])', 0),
|
337 |
+
311: ('[#16](-,:[#8])(=,:[#8])', 0),
|
338 |
+
312: ('[#16](=,:[#8])(=,:[#8])', 0),
|
339 |
+
313: ('[#6]-,:[#6]-,:[#6]#[#6]', 0),
|
340 |
+
314: ('[#8]-,:[#6]-,:[#6]=,:[#7]', 0),
|
341 |
+
315: ('[#8]-,:[#6]-,:[#6]=,:[#8]', 0),
|
342 |
+
316: ('[#7]:[#6]-,:[#16&!H0]', 0),
|
343 |
+
317: ('[#7]-,:[#6]-,:[#6]=,:[#6]', 0),
|
344 |
+
318: ('[#8]=,:[#16]-,:[#6]-,:[#6]', 0),
|
345 |
+
319: ('[#7]#[#6]-,:[#6]=,:[#6]', 0),
|
346 |
+
320: ('[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
|
347 |
+
321: ('[#8]=,:[#16]-,:[#6]-,:[#7]', 0),
|
348 |
+
322: ('[#16]-,:[#16]-,:[#6]:[#6]', 0),
|
349 |
+
323: ('[#6]:[#6]-,:[#6]=,:[#6]', 0),
|
350 |
+
324: ('[#16]:[#6]:[#6]:[#6]', 0),
|
351 |
+
325: ('[#6]:[#7]:[#6]-,:[#6]', 0),
|
352 |
+
326: ('[#16]-,:[#6]:[#7]:[#6]', 0),
|
353 |
+
327: ('[#16]:[#6]:[#6]:[#7]', 0),
|
354 |
+
328: ('[#16]-,:[#6]=,:[#7]-,:[#6]', 0),
|
355 |
+
329: ('[#6]-,:[#8]-,:[#6]=,:[#6]', 0),
|
356 |
+
330: ('[#7]-,:[#7]-,:[#6]:[#6]', 0),
|
357 |
+
331: ('[#16]-,:[#6]=,:[#7&!H0]', 0),
|
358 |
+
332: ('[#16]-,:[#6]-,:[#16]-,:[#6]', 0),
|
359 |
+
333: ('[#6]:[#16]:[#6]-,:[#6]', 0),
|
360 |
+
334: ('[#8]-,:[#16]-,:[#6]:[#6]', 0),
|
361 |
+
335: ('[#6]:[#7]-,:[#6]:[#6]', 0),
|
362 |
+
336: ('[#7]-,:[#16]-,:[#6]:[#6]', 0),
|
363 |
+
337: ('[#7]-,:[#6]:[#7]:[#6]', 0),
|
364 |
+
338: ('[#7]:[#6]:[#6]:[#7]', 0),
|
365 |
+
339: ('[#7]-,:[#6]:[#7]:[#7]', 0),
|
366 |
+
340: ('[#7]-,:[#6]=,:[#7]-,:[#6]', 0),
|
367 |
+
341: ('[#7]-,:[#6]=,:[#7&!H0]', 0),
|
368 |
+
342: ('[#7]-,:[#6]-,:[#16]-,:[#6]', 0),
|
369 |
+
343: ('[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
|
370 |
+
344: ('[#6]-,:[#7]:[#6&!H0]', 0),
|
371 |
+
345: ('[#7]-,:[#6]:[#8]:[#6]', 0),
|
372 |
+
346: ('[#8]=,:[#6]-,:[#6]:[#6]', 0),
|
373 |
+
347: ('[#8]=,:[#6]-,:[#6]:[#7]', 0),
|
374 |
+
348: ('[#6]-,:[#7]-,:[#6]:[#6]', 0),
|
375 |
+
349: ('[#7]:[#7]-,:[#6&!H0]', 0),
|
376 |
+
350: ('[#8]-,:[#6]:[#6]:[#7]', 0),
|
377 |
+
351: ('[#8]-,:[#6]=,:[#6]-,:[#6]', 0),
|
378 |
+
352: ('[#7]-,:[#6]:[#6]:[#7]', 0),
|
379 |
+
353: ('[#6]-,:[#16]-,:[#6]:[#6]', 0),
|
380 |
+
354: ('[Cl]-,:[#6]:[#6]-,:[#6]', 0),
|
381 |
+
355: ('[#7]-,:[#6]=,:[#6&!H0]', 0),
|
382 |
+
356: ('[Cl]-,:[#6]:[#6&!H0]', 0),
|
383 |
+
357: ('[#7]:[#6]:[#7]-,:[#6]', 0),
|
384 |
+
358: ('[Cl]-,:[#6]:[#6]-,:[#8]', 0),
|
385 |
+
359: ('[#6]-,:[#6]:[#7]:[#6]', 0),
|
386 |
+
360: ('[#6]-,:[#6]-,:[#16]-,:[#6]', 0),
|
387 |
+
361: ('[#16]=,:[#6]-,:[#7]-,:[#6]', 0),
|
388 |
+
362: ('[Br]-,:[#6]:[#6]-,:[#6]', 0),
|
389 |
+
363: ('[#7&!H0]-,:[#7&!H0]', 0),
|
390 |
+
364: ('[#16]=,:[#6]-,:[#7&!H0]', 0),
|
391 |
+
365: ('[#6]-,:[#33]-[#8&!H0]', 0),
|
392 |
+
366: ('[#16]:[#6]:[#6&!H0]', 0),
|
393 |
+
367: ('[#8]-,:[#7]-,:[#6]-,:[#6]', 0),
|
394 |
+
368: ('[#7]-,:[#7]-,:[#6]-,:[#6]', 0),
|
395 |
+
369: ('[#6H,#6H2,#6H3]=,:[#6H,#6H2,#6H3]', 0),
|
396 |
+
370: ('[#7]-,:[#7]-,:[#6]-,:[#7]', 0),
|
397 |
+
371: ('[#8]=,:[#6]-,:[#7]-,:[#7]', 0),
|
398 |
+
372: ('[#7]=,:[#6]-,:[#7]-,:[#6]', 0),
|
399 |
+
373: ('[#6]=,:[#6]-,:[#6]:[#6]', 0),
|
400 |
+
374: ('[#6]:[#7]-,:[#6&!H0]', 0),
|
401 |
+
375: ('[#6]-,:[#7]-,:[#7&!H0]', 0),
|
402 |
+
376: ('[#7]:[#6]:[#6]-,:[#6]', 0),
|
403 |
+
377: ('[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
404 |
+
378: ('[#33]-,:[#6]:[#6&!H0]', 0),
|
405 |
+
379: ('[Cl]-,:[#6]:[#6]-,:[Cl]', 0),
|
406 |
+
380: ('[#6]:[#6]:[#7&!H0]', 0),
|
407 |
+
381: ('[#7&!H0]-,:[#6&!H0]', 0),
|
408 |
+
382: ('[Cl]-,:[#6]-,:[#6]-,:[Cl]', 0),
|
409 |
+
383: ('[#7]:[#6]-,:[#6]:[#6]', 0),
|
410 |
+
384: ('[#16]-,:[#6]:[#6]-,:[#6]', 0),
|
411 |
+
385: ('[#16]-,:[#6]:[#6&!H0]', 0),
|
412 |
+
386: ('[#16]-,:[#6]:[#6]-,:[#7]', 0),
|
413 |
+
387: ('[#16]-,:[#6]:[#6]-,:[#8]', 0),
|
414 |
+
388: ('[#8]=,:[#6]-,:[#6]-,:[#6]', 0),
|
415 |
+
389: ('[#8]=,:[#6]-,:[#6]-,:[#7]', 0),
|
416 |
+
390: ('[#8]=,:[#6]-,:[#6]-,:[#8]', 0),
|
417 |
+
391: ('[#7]=,:[#6]-,:[#6]-,:[#6]', 0),
|
418 |
+
392: ('[#7]=,:[#6]-,:[#6&!H0]', 0),
|
419 |
+
393: ('[#6]-,:[#7]-,:[#6&!H0]', 0),
|
420 |
+
394: ('[#8]-,:[#6]:[#6]-,:[#6]', 0),
|
421 |
+
395: ('[#8]-,:[#6]:[#6&!H0]', 0),
|
422 |
+
396: ('[#8]-,:[#6]:[#6]-,:[#7]', 0),
|
423 |
+
397: ('[#8]-,:[#6]:[#6]-,:[#8]', 0),
|
424 |
+
398: ('[#7]-,:[#6]:[#6]-,:[#6]', 0),
|
425 |
+
399: ('[#7]-,:[#6]:[#6&!H0]', 0),
|
426 |
+
400: ('[#7]-,:[#6]:[#6]-,:[#7]', 0),
|
427 |
+
401: ('[#8]-,:[#6]-,:[#6]:[#6]', 0),
|
428 |
+
402: ('[#7]-,:[#6]-,:[#6]:[#6]', 0),
|
429 |
+
403: ('[Cl]-,:[#6]-,:[#6]-,:[#6]', 0),
|
430 |
+
404: ('[Cl]-,:[#6]-,:[#6]-,:[#8]', 0),
|
431 |
+
405: ('[#6]:[#6]-,:[#6]:[#6]', 0),
|
432 |
+
406: ('[#8]=,:[#6]-,:[#6]=,:[#6]', 0),
|
433 |
+
407: ('[Br]-,:[#6]-,:[#6]-,:[#6]', 0),
|
434 |
+
408: ('[#7]=,:[#6]-,:[#6]=,:[#6]', 0),
|
435 |
+
409: ('[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
|
436 |
+
410: ('[#7]:[#6]-,:[#8&!H0]', 0),
|
437 |
+
411: ('[#8]=,:[#7]-,:c:c', 0),
|
438 |
+
412: ('[#8]-,:[#6]-,:[#7&!H0]', 0),
|
439 |
+
413: ('[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
|
440 |
+
414: ('[Cl]-,:[#6]-,:[#6]=,:[#8]', 0),
|
441 |
+
415: ('[Br]-,:[#6]-,:[#6]=,:[#8]', 0),
|
442 |
+
416: ('[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
|
443 |
+
417: ('[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
|
444 |
+
418: ('[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
445 |
+
419: ('[#8]-,:[#6]-,:[#6]-,:[#7]', 0),
|
446 |
+
420: ('[#8]-,:[#6]-,:[#6]-,:[#8]', 0),
|
447 |
+
421: ('N#[#6]-,:[#6]-,:[#6]', 0),
|
448 |
+
422: ('[#7]-,:[#6]-,:[#6]-,:[#7]', 0),
|
449 |
+
423: ('[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
450 |
+
424: ('[#6&!H0]-,:[#8&!H0]', 0),
|
451 |
+
425: ('n:c:n:c', 0),
|
452 |
+
426: ('[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
|
453 |
+
427: ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
454 |
+
428: ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
|
455 |
+
429: ('[#7]=,:[#6]-,:[#6]:[#6&!H0]', 0),
|
456 |
+
430: ('c:c-,:[#7]-,:c:c', 0),
|
457 |
+
431: ('[#6]-,:[#6]:[#6]-,:c:c', 0),
|
458 |
+
432: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
459 |
+
433: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
460 |
+
434: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
461 |
+
435: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
462 |
+
436: ('[Cl]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
463 |
+
437: ('c:c-,:[#6]=,:[#6]-,:[#6]', 0),
|
464 |
+
438: ('[#6]-,:[#6]:[#6]-,:[#7]-,:[#6]', 0),
|
465 |
+
439: ('[#6]-,:[#16]-,:[#6]-,:[#6]-,:[#6]', 0),
|
466 |
+
440: ('[#7]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
467 |
+
441: ('[#8]=,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
468 |
+
442: ('[#6]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
469 |
+
443: ('[#6]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
470 |
+
444: ('[Cl]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
471 |
+
445: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
472 |
+
446: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
473 |
+
447: ('[#6]-,:[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
|
474 |
+
448: ('c:c-,:[#6]-,:[#6]-,:[#6]', 0),
|
475 |
+
449: ('[#7]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
476 |
+
450: ('[#8]=,:[#6]-,:[#6]-,:c:c', 0),
|
477 |
+
451: ('[Cl]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
478 |
+
452: ('[#6H,#6H2,#6H3]-,:[#6]=,:[#6H,#6H2,#6H3]', 0),
|
479 |
+
453: ('[#7]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
480 |
+
454: ('[#7]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
|
481 |
+
455: ('[#8]=,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
482 |
+
456: ('[#6]-,:c:c:[#6]-,:[#6]', 0),
|
483 |
+
457: ('[#6]-,:[#8]-,:[#6]-,:[#6]:c', 0),
|
484 |
+
458: ('[#8]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
485 |
+
459: ('[#8]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
486 |
+
460: ('[#7]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
487 |
+
461: ('[#6]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
488 |
+
462: ('[Cl]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
489 |
+
463: ('[#6]-,:[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
|
490 |
+
464: ('[#7]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
491 |
+
465: ('[#7]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
492 |
+
466: ('[#6]-,:[#7]-,:[#6]-,:[#6]-,:[#6]', 0),
|
493 |
+
467: ('[#6]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
494 |
+
468: ('[#7]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
495 |
+
469: ('c:c:n:n:c', 0),
|
496 |
+
470: ('[#6]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
497 |
+
471: ('c:[#6]-,:[#6]-,:[#6]:c', 0),
|
498 |
+
472: ('[#8]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
499 |
+
473: ('c:c-,:[#8]-,:[#6]-,:[#6]', 0),
|
500 |
+
474: ('[#7]-,:[#6]:c:c:n', 0),
|
501 |
+
475: ('[#8]=,:[#6]-,:[#8]-,:[#6]:c', 0),
|
502 |
+
476: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
503 |
+
477: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#7]', 0),
|
504 |
+
478: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
|
505 |
+
479: ('[#6]-,:[#8]-,:[#6]:[#6]-,:[#6]', 0),
|
506 |
+
480: ('[#8]=,:[#33]-,:[#6]:c:c', 0),
|
507 |
+
481: ('[#6]-,:[#7]-,:[#6]-,:[#6]:c', 0),
|
508 |
+
482: ('[#16]-,:[#6]:c:c-,:[#7]', 0),
|
509 |
+
483: ('[#8]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
510 |
+
484: ('[#8]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
511 |
+
485: ('[#6]-,:[#6]-,:[#8]-,:[#6]:c', 0),
|
512 |
+
486: ('[#7]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
513 |
+
487: ('[#6]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
514 |
+
488: ('[#7]-,:[#7]-,:[#6]-,:[#7&!H0]', 0),
|
515 |
+
489: ('[#6]-,:[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
|
516 |
+
490: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
517 |
+
491: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
518 |
+
492: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
519 |
+
493: ('[#6]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
520 |
+
494: ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
|
521 |
+
495: ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
522 |
+
496: ('[#6&!H0]-,:[#6]-,:[#7&!H0]', 0),
|
523 |
+
497: ('[#6]-,:[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
|
524 |
+
498: ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
525 |
+
499: ('[#8]=,:[#6]-,:[#7]-,:[#6&!H0]', 0),
|
526 |
+
500: ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#7]', 0),
|
527 |
+
501: ('[#8]=,:[#7]-,:[#6]:[#6]-,:[#7]', 0),
|
528 |
+
502: ('[#8]=,:[#7]-,:c:c-,:[#8]', 0),
|
529 |
+
503: ('[#8]=,:[#6]-,:[#7]-,:[#6]=,:[#8]', 0),
|
530 |
+
504: ('[#8]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
531 |
+
505: ('[#8]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
|
532 |
+
506: ('[#8]-,:[#6]:[#6]:[#6]-,:[#8]', 0),
|
533 |
+
507: ('[#7]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
534 |
+
508: ('[#8]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
535 |
+
509: ('[#6]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
536 |
+
510: ('[#6]-,:[#7]-,:[#6]:[#6]-,:[#6]', 0),
|
537 |
+
511: ('[#6]-,:[#6]-,:[#16]-,:[#6]-,:[#6]', 0),
|
538 |
+
512: ('[#8]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
539 |
+
513: ('[#6]-,:[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
|
540 |
+
514: ('[#8]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
541 |
+
515: ('[#8]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
542 |
+
516: ('[#8]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
543 |
+
517: ('[#6]-,:[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
|
544 |
+
518: ('[#7]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
545 |
+
519: ('[#6]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
546 |
+
520: ('[#6]=,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
547 |
+
521: ('[#6]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
548 |
+
522: ('[Cl]-,:[#6]:[#6]-,:[#6]=,:[#8]', 0),
|
549 |
+
523: ('[Br]-,:[#6]:c:c-,:[#6]', 0),
|
550 |
+
524: ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
551 |
+
525: ('[#8]=,:[#6]-,:[#6]=,:[#6&!H0]', 0),
|
552 |
+
526: ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#7]', 0),
|
553 |
+
527: ('[#7]-,:[#6]-,:[#7]-,:[#6]:c', 0),
|
554 |
+
528: ('[Br]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
555 |
+
529: ('[#7]#[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
556 |
+
530: ('[#6]-,:[#6]=,:[#6]-,:[#6]:c', 0),
|
557 |
+
531: ('[#6]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
558 |
+
532: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
559 |
+
533: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
560 |
+
534: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
561 |
+
535: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
562 |
+
536: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
563 |
+
537: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
564 |
+
538: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
565 |
+
539: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
566 |
+
540: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
567 |
+
541: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
568 |
+
542: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
569 |
+
543: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
570 |
+
544: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
571 |
+
545: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
572 |
+
546: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
573 |
+
547: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
574 |
+
548: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
575 |
+
549: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
576 |
+
550: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
|
577 |
+
551: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
578 |
+
552: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
|
579 |
+
553: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
580 |
+
554: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
|
581 |
+
555: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
582 |
+
556: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
|
583 |
+
557: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
584 |
+
558: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
|
585 |
+
559: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](=,:[#8])-,:[#6]', 0),
|
586 |
+
560: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
|
587 |
+
561: ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
|
588 |
+
562: ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]-,:[#6]', 0),
|
589 |
+
563: ('[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
|
590 |
+
564: ('[#6]-,:[#6](-,:[#6])(-,:[#6])-,:[#6]-,:[#6]', 0),
|
591 |
+
565: ('[#6]-,:[#6](-,:[#6])-,:[#6](-,:[#6])-,:[#6]', 0),
|
592 |
+
566: ('[#6]c1ccc([#6])cc1', 0),
|
593 |
+
567: ('[#6]c1ccc([#8])cc1', 0),
|
594 |
+
568: ('[#6]c1ccc([#16])cc1', 0),
|
595 |
+
569: ('[#6]c1ccc([#7])cc1', 0),
|
596 |
+
570: ('[#6]c1ccc(Cl)cc1', 0),
|
597 |
+
571: ('[#6]c1ccc(Br)cc1', 0),
|
598 |
+
572: ('[#8]c1ccc([#8])cc1', 0),
|
599 |
+
573: ('[#8]c1ccc([#16])cc1', 0),
|
600 |
+
574: ('[#8]c1ccc([#7])cc1', 0),
|
601 |
+
575: ('[#8]c1ccc(Cl)cc1', 0),
|
602 |
+
576: ('[#8]c1ccc(Br)cc1', 0),
|
603 |
+
577: ('[#16]c1ccc([#16])cc1', 0),
|
604 |
+
578: ('[#16]c1ccc([#7])cc1', 0),
|
605 |
+
579: ('[#16]c1ccc(Cl)cc1', 0),
|
606 |
+
580: ('[#16]c1ccc(Br)cc1', 0),
|
607 |
+
581: ('[#7]c1ccc([#7])cc1', 0),
|
608 |
+
582: ('[#7]c1ccc(Cl)cc1', 0),
|
609 |
+
583: ('[#7]c1ccc(Br)cc1', 0),
|
610 |
+
584: ('Clc1ccc(Cl)cc1', 0),
|
611 |
+
585: ('Clc1ccc(Br)cc1', 0),
|
612 |
+
586: ('Brc1ccc(Br)cc1', 0),
|
613 |
+
587: ('[#6]c1cc([#6])ccc1', 0),
|
614 |
+
588: ('[#6]c1cc([#8])ccc1', 0),
|
615 |
+
589: ('[#6]c1cc([#16])ccc1', 0),
|
616 |
+
590: ('[#6]c1cc([#7])ccc1', 0),
|
617 |
+
591: ('[#6]c1cc(Cl)ccc1', 0),
|
618 |
+
592: ('[#6]c1cc(Br)ccc1', 0),
|
619 |
+
593: ('[#8]c1cc([#8])ccc1', 0),
|
620 |
+
594: ('[#8]c1cc([#16])ccc1', 0),
|
621 |
+
595: ('[#8]c1cc([#7])ccc1', 0),
|
622 |
+
596: ('[#8]c1cc(Cl)ccc1', 0),
|
623 |
+
597: ('[#8]c1cc(Br)ccc1', 0),
|
624 |
+
598: ('[#16]c1cc([#16])ccc1', 0),
|
625 |
+
599: ('[#16]c1cc([#7])ccc1', 0),
|
626 |
+
600: ('[#16]c1cc(Cl)ccc1', 0),
|
627 |
+
601: ('[#16]c1cc(Br)ccc1', 0),
|
628 |
+
602: ('[#7]c1cc([#7])ccc1', 0),
|
629 |
+
603: ('[#7]c1cc(Cl)ccc1', 0),
|
630 |
+
604: ('[#7]c1cc(Br)ccc1', 0),
|
631 |
+
605: ('Clc1cc(Cl)ccc1', 0),
|
632 |
+
606: ('Clc1cc(Br)ccc1', 0),
|
633 |
+
607: ('Brc1cc(Br)ccc1', 0),
|
634 |
+
608: ('[#6]c1c([#6])cccc1', 0),
|
635 |
+
609: ('[#6]c1c([#8])cccc1', 0),
|
636 |
+
610: ('[#6]c1c([#16])cccc1', 0),
|
637 |
+
611: ('[#6]c1c([#7])cccc1', 0),
|
638 |
+
612: ('[#6]c1c(Cl)cccc1', 0),
|
639 |
+
613: ('[#6]c1c(Br)cccc1', 0),
|
640 |
+
614: ('[#8]c1c([#8])cccc1', 0),
|
641 |
+
615: ('[#8]c1c([#16])cccc1', 0),
|
642 |
+
616: ('[#8]c1c([#7])cccc1', 0),
|
643 |
+
617: ('[#8]c1c(Cl)cccc1', 0),
|
644 |
+
618: ('[#8]c1c(Br)cccc1', 0),
|
645 |
+
619: ('[#16]c1c([#16])cccc1', 0),
|
646 |
+
620: ('[#16]c1c([#7])cccc1', 0),
|
647 |
+
621: ('[#16]c1c(Cl)cccc1', 0),
|
648 |
+
622: ('[#16]c1c(Br)cccc1', 0),
|
649 |
+
623: ('[#7]c1c([#7])cccc1', 0),
|
650 |
+
624: ('[#7]c1c(Cl)cccc1', 0),
|
651 |
+
625: ('[#7]c1c(Br)cccc1', 0),
|
652 |
+
626: ('Clc1c(Cl)cccc1', 0),
|
653 |
+
627: ('Clc1c(Br)cccc1', 0),
|
654 |
+
628: ('Brc1c(Br)cccc1', 0),
|
655 |
+
629: ('[#6][#6]1[#6][#6][#6]([#6])[#6][#6]1', 0),
|
656 |
+
630: ('[#6][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
|
657 |
+
631: ('[#6][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
658 |
+
632: ('[#6][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
659 |
+
633: ('[#6][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
660 |
+
634: ('[#6][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
661 |
+
635: ('[#8][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
|
662 |
+
636: ('[#8][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
663 |
+
637: ('[#8][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
664 |
+
638: ('[#8][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
665 |
+
639: ('[#8][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
666 |
+
640: ('[#16][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
667 |
+
641: ('[#16][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
668 |
+
642: ('[#16][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
669 |
+
643: ('[#16][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
670 |
+
644: ('[#7][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
671 |
+
645: ('[#7][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
672 |
+
646: ('[#7][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
673 |
+
647: ('Cl[#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
674 |
+
648: ('Cl[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
675 |
+
649: ('Br[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
676 |
+
650: ('[#6][#6]1[#6][#6]([#6])[#6][#6][#6]1', 0),
|
677 |
+
651: ('[#6][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
|
678 |
+
652: ('[#6][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
679 |
+
653: ('[#6][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
680 |
+
654: ('[#6][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
681 |
+
655: ('[#6][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
682 |
+
656: ('[#8][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
|
683 |
+
657: ('[#8][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
684 |
+
658: ('[#8][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
685 |
+
659: ('[#8][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
686 |
+
660: ('[#8][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
687 |
+
661: ('[#16][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
688 |
+
662: ('[#16][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
689 |
+
663: ('[#16][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
690 |
+
664: ('[#16][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
691 |
+
665: ('[#7][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
692 |
+
666: ('[#7][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
693 |
+
667: ('[#7][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
694 |
+
668: ('Cl[#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
695 |
+
669: ('Cl[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
696 |
+
670: ('Br[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
697 |
+
671: ('[#6][#6]1[#6]([#6])[#6][#6][#6][#6]1', 0),
|
698 |
+
672: ('[#6][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
|
699 |
+
673: ('[#6][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
700 |
+
674: ('[#6][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
701 |
+
675: ('[#6][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
702 |
+
676: ('[#6][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
703 |
+
677: ('[#8][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
|
704 |
+
678: ('[#8][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
705 |
+
679: ('[#8][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
706 |
+
680: ('[#8][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
707 |
+
681: ('[#8][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
708 |
+
682: ('[#16][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
709 |
+
683: ('[#16][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
710 |
+
684: ('[#16][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
711 |
+
685: ('[#16][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
712 |
+
686: ('[#7][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
713 |
+
687: ('[#7][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
714 |
+
688: ('[#7][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
715 |
+
689: ('Cl[#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
716 |
+
690: ('Cl[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
717 |
+
691: ('Br[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
718 |
+
692: ('[#6][#6]1[#6][#6]([#6])[#6][#6]1', 0),
|
719 |
+
693: ('[#6][#6]1[#6][#6]([#8])[#6][#6]1', 0),
|
720 |
+
694: ('[#6][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
721 |
+
695: ('[#6][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
722 |
+
696: ('[#6][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
723 |
+
697: ('[#6][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
724 |
+
698: ('[#8][#6]1[#6][#6]([#8])[#6][#6]1', 0),
|
725 |
+
699: ('[#8][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
726 |
+
700: ('[#8][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
727 |
+
701: ('[#8][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
728 |
+
702: ('[#8][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
729 |
+
703: ('[#16][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
730 |
+
704: ('[#16][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
731 |
+
705: ('[#16][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
732 |
+
706: ('[#16][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
733 |
+
707: ('[#7][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
734 |
+
708: ('[#7][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
735 |
+
709: ('[#7][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
736 |
+
710: ('Cl[#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
737 |
+
711: ('Cl[#6]1[#6][#6](Br)[#6][#6]1', 0),
|
738 |
+
712: ('Br[#6]1[#6][#6](Br)[#6][#6]1', 0),
|
739 |
+
713: ('[#6][#6]1[#6]([#6])[#6][#6][#6]1', 0),
|
740 |
+
714: ('[#6][#6]1[#6]([#8])[#6][#6][#6]1', 0),
|
741 |
+
715: ('[#6][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
742 |
+
716: ('[#6][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
743 |
+
717: ('[#6][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
744 |
+
718: ('[#6][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
745 |
+
719: ('[#8][#6]1[#6]([#8])[#6][#6][#6]1', 0),
|
746 |
+
720: ('[#8][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
747 |
+
721: ('[#8][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
748 |
+
722: ('[#8][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
749 |
+
723: ('[#8][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
750 |
+
724: ('[#16][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
751 |
+
725: ('[#16][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
752 |
+
726: ('[#16][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
753 |
+
727: ('[#16][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
754 |
+
728: ('[#7][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
755 |
+
729: ('[#7][#6]1[#6](Cl)[#6][#6]1', 0),
|
756 |
+
730: ('[#7][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
757 |
+
731: ('Cl[#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
758 |
+
732: ('Cl[#6]1[#6](Br)[#6][#6][#6]1', 0),
|
759 |
+
733: ('Br[#6]1[#6](Br)[#6][#6][#6]1', 0)}
|
760 |
+
|
761 |
+
# Lazily built table of (compiled SMARTS pattern, count) tuples; it stays None
# until calcPubChemFingerPart1 fills it from smartsPatts on its first call.
PubchemKeys = None
|
762 |
+
|
763 |
+
|
764 |
+
def InitKeys(keyList, keyDict):
    """ *Internal Use Only*
    generates SMARTS patterns for the keys, run once

    **Arguments**
      - keyList: pre-sized list that is filled in place; the entry for
        key ``k`` is written to index ``k - 1``
      - keyDict: maps integer keys to ``(SMARTS string, count)`` tuples

    Entries whose pattern is ``'?'`` are skipped.  A SMARTS string that
    cannot be parsed is reported on stdout and its slot in ``keyList`` is
    left untouched.

    **Raises**
      AssertionError if ``keyList`` and ``keyDict`` differ in length.
    """
    assert len(keyList) == len(keyDict), 'length mismatch'
    for key, (patt, count) in keyDict.items():
        if patt == '?':
            # Placeholder entry: there is no pattern to compile.
            continue
        sma = Chem.MolFromSmarts(patt)
        if not sma:
            print('SMARTS parser error for key #%d: %s' % (key, patt))
        else:
            keyList[key - 1] = sma, count
|
777 |
+
|
778 |
+
|
779 |
+
def calcPubChemFingerPart1(mol, **kwargs):
    """ Calculate PubChem Fingerprints (1-115; 263-881)

    **Arguments**
      - mol: the molecule to be fingerprinted
      - ctor (keyword, optional): callable that receives the vector length
        and returns an index-assignable bit container; defaults to
        ``DataStructs.SparseBitVect``.  Other keyword arguments are ignored.

    **Returns**
      the container built by ``ctor`` (a _DataStructs.SparseBitVect_ by
      default).  It has ``len(smartsPatts) + 1`` positions: position ``k``
      belongs to key ``k`` of ``smartsPatts`` and position 0 is never set.

    A key with count 0 is switched on when the pattern matches at all; a key
    with a positive count is switched on when the molecule has more than
    ``count`` matches.

    Example (requires RDKit; output not reproduced here)::

        m = Chem.MolFromSmiles('CNO')
        bv = calcPubChemFingerPart1(m)
        tuple(bv.GetOnBits())
    """
    global PubchemKeys
    if PubchemKeys is None:
        # Build into a local first so that a failure inside InitKeys does not
        # leave a half-initialised (all-None) table cached for later calls.
        compiled = [(None, 0)] * len(smartsPatts)
        InitKeys(compiled, smartsPatts)
        PubchemKeys = compiled
    ctor = kwargs.get('ctor', DataStructs.SparseBitVect)
    res = ctor(len(PubchemKeys) + 1)
    for position, (patt, count) in enumerate(PubchemKeys, start=1):
        if patt is None:
            # Key without a compiled pattern ('?' or unparsable SMARTS).
            continue
        if count == 0:
            res[position] = mol.HasSubstructMatch(patt)
        elif len(mol.GetSubstructMatches(patt)) > count:
            res[position] = 1
    return res
|
809 |
+
|
810 |
+
|
811 |
+
def func_1(mol, bits):
    """ *Internal Use Only*
    Calculate PubChem Fingerprints (116-263)

    Counts every ring returned by ``mol.GetRingInfo().AtomRings()`` by its
    size and switches on the "at least N rings of size S" positions.

    **Arguments**
      - mol: molecule providing ``GetRingInfo().AtomRings()``
      - bits: index-assignable sequence, modified in place

    **Returns**
      ``(ringSize, bits)``: the sizes of all rings in the order reported,
      and the same ``bits`` object that was passed in.
    """
    # (ring size, position of the ">= 1 ring" flag, highest count that has
    # its own flag).  Each further count level sits 7 positions later; the
    # slots in between are the ones written by func_2 ... func_7.
    layout = (
        (3, 0, 2),
        (4, 14, 2),
        (5, 28, 5),
        (6, 63, 5),
        (7, 98, 2),
        (8, 112, 2),
        (9, 126, 1),
        (10, 133, 1),
    )
    ringSize = [len(ring) for ring in mol.GetRingInfo().AtomRings()]
    for size, first_bit, max_level in layout:
        # Rings outside sizes 3-10 are recorded in ringSize but set no flag.
        reached = min(ringSize.count(size), max_level)
        for level in range(reached):
            bits[first_bit + 7 * level] = 1
    return ringSize, bits
|
905 |
+
|
906 |
+
|
907 |
+
def func_2(mol, bits):
    """ *Internal Use Only*
    saturated or aromatic carbon-only ring

    A ring from ``mol.GetRingInfo().BondRings()`` is counted when all of its
    bonds are single, or when all of its bonds are aromatic and every bonded
    atom is carbon.

    **Arguments**
      - mol: molecule providing ``GetRingInfo().BondRings()`` and
        ``GetBondWithIdx``
      - bits: index-assignable sequence, modified in place

    **Returns**
      ``(ringSize, bits)``: sizes of the counted rings, and the same
      ``bits`` object that was passed in.
    """
    # (ring size, position of the ">= 1 ring" flag, highest count that has
    # its own flag); each further count level sits 7 positions later.
    layout = (
        (3, 1, 2),
        (4, 15, 2),
        (5, 29, 5),
        (6, 64, 5),
        (7, 99, 2),
        (8, 113, 2),
        (9, 127, 1),
        (10, 134, 1),
    )
    ringSize = []
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        # NOTE(review): an all-single-bond ring is counted whatever its atoms
        # are; the carbon-only test applies to aromatic rings only. Confirm
        # this is the intended reading of the category name.
        if all(kind == 'SINGLE' for kind in kinds):
            ringSize.append(len(ring))
        if all(kind == 'AROMATIC' for kind in kinds) and all(
                bond.GetBeginAtom().GetAtomicNum() == 6
                and bond.GetEndAtom().GetAtomicNum() == 6
                for bond in bonds):
            ringSize.append(len(ring))
    for size, first_bit, max_level in layout:
        for level in range(min(ringSize.count(size), max_level)):
            bits[first_bit + 7 * level] = 1
    return ringSize, bits
|
1025 |
+
|
1026 |
+
|
1027 |
+
def func_3(mol, bits):
    """ *Internal Use Only*
    saturated or aromatic nitrogen-containing

    A ring from ``mol.GetRingInfo().BondRings()`` is counted when all of its
    bonds are single, or when all of its bonds are aromatic and at least one
    bonded atom is nitrogen.

    **Arguments**
      - mol: molecule providing ``GetRingInfo().BondRings()`` and
        ``GetBondWithIdx``
      - bits: index-assignable sequence, modified in place

    **Returns**
      ``(ringSize, bits)``: sizes of the counted rings, and the same
      ``bits`` object that was passed in.
    """
    # (ring size, position of the ">= 1 ring" flag, highest count that has
    # its own flag); each further count level sits 7 positions later.
    layout = (
        (3, 2, 2),
        (4, 16, 2),
        (5, 30, 5),
        (6, 65, 5),
        (7, 100, 2),
        (8, 114, 2),
        (9, 128, 1),
        (10, 135, 1),
    )
    ringSize = []
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        # NOTE(review): an all-single-bond ring is counted whatever its atoms
        # are; the nitrogen test applies to aromatic rings only. Confirm this
        # is the intended reading of the category name.
        if all(kind == 'SINGLE' for kind in kinds):
            ringSize.append(len(ring))
        if all(kind == 'AROMATIC' for kind in kinds) and any(
                bond.GetBeginAtom().GetAtomicNum() == 7
                or bond.GetEndAtom().GetAtomicNum() == 7
                for bond in bonds):
            ringSize.append(len(ring))
    for size, first_bit, max_level in layout:
        for level in range(min(ringSize.count(size), max_level)):
            bits[first_bit + 7 * level] = 1
    return ringSize, bits
|
1145 |
+
|
1146 |
+
|
1147 |
+
def func_4(mol, bits):
    """ *Internal Use Only*
    saturated or aromatic heteroatom-containing

    A ring from ``mol.GetRingInfo().BondRings()`` is counted when all of its
    bonds are single, or when all of its bonds are aromatic and at least one
    bonded atom has an atomic number other than 1 or 6.

    **Arguments**
      - mol: molecule providing ``GetRingInfo().BondRings()`` and
        ``GetBondWithIdx``
      - bits: index-assignable sequence, modified in place

    **Returns**
      ``(ringSize, bits)``: sizes of the counted rings, and the same
      ``bits`` object that was passed in.
    """
    # (ring size, position of the ">= 1 ring" flag, highest count that has
    # its own flag); each further count level sits 7 positions later.
    layout = (
        (3, 3, 2),
        (4, 17, 2),
        (5, 31, 5),
        (6, 66, 5),
        (7, 101, 2),
        (8, 115, 2),
        (9, 129, 1),
        (10, 136, 1),
    )
    non_hetero = (1, 6)  # hydrogen and carbon
    ringSize = []
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        # NOTE(review): an all-single-bond ring is counted whatever its atoms
        # are; the heteroatom test applies to aromatic rings only. Confirm
        # this is the intended reading of the category name.
        if all(kind == 'SINGLE' for kind in kinds):
            ringSize.append(len(ring))
        if all(kind == 'AROMATIC' for kind in kinds) and any(
                bond.GetBeginAtom().GetAtomicNum() not in non_hetero
                or bond.GetEndAtom().GetAtomicNum() not in non_hetero
                for bond in bonds):
            ringSize.append(len(ring))
    for size, first_bit, max_level in layout:
        for level in range(min(ringSize.count(size), max_level)):
            bits[first_bit + 7 * level] = 1
    return ringSize, bits
|
1265 |
+
|
1266 |
+
|
1267 |
+
def func_5(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic carbon-only

    A ring from ``mol.GetRingInfo().BondRings()`` is counted when it has at
    least one non-single bond, no aromatic bond, and every bonded atom is
    carbon.

    **Arguments**
      - mol: molecule providing ``GetRingInfo().BondRings()`` and
        ``GetBondWithIdx``
      - bits: index-assignable sequence, modified in place

    **Returns**
      ``(ringSize, bits)``: sizes of the counted rings, and the same
      ``bits`` object that was passed in.
    """
    # (ring size, position of the ">= 1 ring" flag, highest count that has
    # its own flag); each further count level sits 7 positions later.
    layout = (
        (3, 4, 2),
        (4, 18, 2),
        (5, 32, 5),
        (6, 67, 5),
        (7, 102, 2),
        (8, 116, 2),
        (9, 130, 1),
        (10, 137, 1),
    )
    ringSize = []
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        unsaturated = any(kind != 'SINGLE' for kind in kinds)
        nonaromatic = 'AROMATIC' not in kinds
        all_carbon = all(
            bond.GetBeginAtom().GetAtomicNum() == 6
            and bond.GetEndAtom().GetAtomicNum() == 6
            for bond in bonds)
        if unsaturated and nonaromatic and all_carbon:
            ringSize.append(len(ring))
    for size, first_bit, max_level in layout:
        for level in range(min(ringSize.count(size), max_level)):
            bits[first_bit + 7 * level] = 1
    return ringSize, bits
|
1381 |
+
|
1382 |
+
|
1383 |
+
def func_6(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic nitrogen-containing

    A ring from ``mol.GetRingInfo().BondRings()`` is counted when it has at
    least one non-single bond, no aromatic bond, and at least one bonded
    atom is nitrogen.

    **Arguments**
      - mol: molecule providing ``GetRingInfo().BondRings()`` and
        ``GetBondWithIdx``
      - bits: index-assignable sequence, modified in place

    **Returns**
      ``(ringSize, bits)``: sizes of the counted rings, and the same
      ``bits`` object that was passed in.
    """
    # (ring size, position of the ">= 1 ring" flag, highest count that has
    # its own flag); each further count level sits 7 positions later.
    layout = (
        (3, 5, 2),
        (4, 19, 2),
        (5, 33, 5),
        (6, 68, 5),
        (7, 103, 2),
        (8, 117, 2),
        (9, 131, 1),
        (10, 138, 1),
    )
    ringSize = []
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        unsaturated = any(kind != 'SINGLE' for kind in kinds)
        nonaromatic = 'AROMATIC' not in kinds
        has_nitrogen = any(
            bond.GetBeginAtom().GetAtomicNum() == 7
            or bond.GetEndAtom().GetAtomicNum() == 7
            for bond in bonds)
        if unsaturated and nonaromatic and has_nitrogen:
            ringSize.append(len(ring))
    for size, first_bit, max_level in layout:
        for level in range(min(ringSize.count(size), max_level)):
            bits[first_bit + 7 * level] = 1
    return ringSize, bits
|
1497 |
+
|
1498 |
+
|
1499 |
+
def func_7(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic heteroatom-containing

    A ring qualifies when at least one of its bonds is not ``SINGLE``,
    none of its bonds is ``AROMATIC`` and at least one bond endpoint has an
    atomic number other than 1 or 6.

    :param mol: molecule exposing ``GetRingInfo().BondRings()`` and
        ``GetBondWithIdx()``.
    :param bits: mutable bit list; the ring-count bits are set in place.
    :return: ``(ringSize, bits)`` where ``ringSize`` holds the size (number
        of bonds) of every qualifying ring, in ring order.
    """
    # ring size -> bit indices switched on once the number of qualifying
    # rings of that size reaches 1, 2, 3, ... (cumulative).
    bit_layout = {
        3: (6, 13),
        4: (20, 27),
        5: (34, 41, 48, 55, 62),
        6: (69, 76, 83, 90, 97),
        7: (104, 111),
        8: (118, 125),
        9: (132,),
        10: (139,),
    }
    tally = dict.fromkeys(bit_layout, 0)
    ringSize = []

    for ring_bonds in mol.GetRingInfo().BondRings():
        members = [mol.GetBondWithIdx(idx) for idx in ring_bonds]
        bond_kinds = [member.GetBondType().name for member in members]

        has_multiple_bond = any(kind != 'SINGLE' for kind in bond_kinds)
        has_aromatic_bond = 'AROMATIC' in bond_kinds
        has_heteroatom = any(
            member.GetBeginAtom().GetAtomicNum() not in (1, 6)
            or member.GetEndAtom().GetAtomicNum() not in (1, 6)
            for member in members
        )
        if has_aromatic_bond or not (has_multiple_bond and has_heteroatom):
            continue

        size = len(ring_bonds)
        ringSize.append(size)
        # Sizes outside 3..10 are reported in ringSize but have no bits.
        if size in tally:
            tally[size] += 1

    for size, positions in bit_layout.items():
        # A count of n turns on the first n positions (capped by the table).
        for position in positions[:tally[size]]:
            bits[position] = 1
    return ringSize, bits
|
1613 |
+
|
1614 |
+
|
1615 |
+
def func_8(mol, bits):
    """ *Internal Use Only*
    aromatic rings or hetero-aromatic rings

    A ring is aromatic when every one of its bonds has type ``AROMATIC``.
    An aromatic ring is hetero-aromatic when at least one bond endpoint has
    an atomic number other than 1 or 6.

    Two independent cumulative series are written:

    * ``bits[140], bits[142], bits[144], bits[146]`` -- at least 1, 2, 3, 4
      aromatic rings;
    * ``bits[141], bits[143], bits[145], bits[147]`` -- at least 1, 2, 3, 4
      hetero-aromatic rings.

    NOTE: the hetero-aromatic series depends only on the number of
    hetero-aromatic rings. It is no longer tied to the aromatic count being
    exactly equal, which left those bits unset for e.g. one benzene plus
    one pyridine ring.

    :param mol: molecule exposing ``GetRingInfo().BondRings()`` and
        ``GetBondWithIdx()``.
    :param bits: mutable bit list; updated in place.
    :return: the same ``bits`` list.
    """
    aromatic_positions = (140, 142, 144, 146)
    hetero_aromatic_positions = (141, 143, 145, 147)
    aromatic_count = 0
    hetero_aromatic_count = 0

    for ring in mol.GetRingInfo().BondRings():
        members = [mol.GetBondWithIdx(idx) for idx in ring]
        if any(member.GetBondType().name != 'AROMATIC' for member in members):
            continue
        aromatic_count += 1
        if any(
            member.GetBeginAtom().GetAtomicNum() not in (1, 6)
            or member.GetEndAtom().GetAtomicNum() not in (1, 6)
            for member in members
        ):
            hetero_aromatic_count += 1

    # A count of n turns on the first n positions; counts above 4 saturate.
    for position in aromatic_positions[:aromatic_count]:
        bits[position] = 1
    for position in hetero_aromatic_positions[:hetero_aromatic_count]:
        bits[position] = 1
    return bits
|
1671 |
+
|
1672 |
+
|
1673 |
+
def calcPubChemFingerPart2(mol):  # 116-263
    """ *Internal Use Only*
    Calculate PubChem Fingerprints (116-263)

    Runs the ring-counting helpers ``func_1`` .. ``func_8`` in order over a
    fresh 148-element bit list and returns that list.
    """
    bits = [0] * 148
    # func_1..func_7 return (ringSize, bits); only the bits are carried on.
    for ring_counter in (func_1, func_2, func_3, func_4,
                         func_5, func_6, func_7):
        bits = ring_counter(mol, bits)[1]
    # func_8 returns the bit list directly.
    return func_8(mol, bits)
|
1688 |
+
|
1689 |
+
|
1690 |
+
def GetPubChemFPs(mol):
    """*Internal Use Only*
    Calculate PubChem Fingerprints

    Explicit hydrogens are added to ``mol`` first. The result merges the
    bit string from ``calcPubChemFingerPart1`` with the 148 ring bits from
    ``calcPubChemFingerPart2``:

    * bit-string positions 1..115 -> output indices 0..114 (position 0 of
      the bit string is skipped);
    * ring bits -> output indices starting at 115;
    * bit-string positions 116..733 -> output indices starting at 263.

    :param mol: molecule accepted by ``Chem.AddHs``.
    :return: numpy boolean array of length 881.
    """
    mol = Chem.AddHs(mol)
    AllBits = [0] * 881

    smarts_bits = list(calcPubChemFingerPart1(mol).ToBitString())
    # (first output index, slice of the bit string copied from there on)
    segments = (
        (0, smarts_bits[1:116]),
        (115 + 148, smarts_bits[116:734]),
    )
    for first_index, segment in segments:
        for position, flag in enumerate(segment, first_index):
            if flag == '1':
                AllBits[position] = 1

    for position, flag in enumerate(calcPubChemFingerPart2(mol), 115):
        if flag == 1:
            AllBits[position] = 1

    return np.array(AllBits, dtype=np.bool_)
|
1710 |
+
|
1711 |
+
|
1712 |
+
# ------------------------------------
|
1713 |
+
|
1714 |
+
|
1715 |
+
# Directory containing this module; bundled data files are resolved against it.
file_path = os.path.dirname(__file__)


def GetPubChemFPInfos():
    """Read ``pubchemfp.xlsx`` from this module's directory with pandas.

    :return: whatever ``pandas.read_excel`` returns for that workbook.
    """
    workbook = os.path.join(file_path, 'pubchemfp.xlsx')
    return pd.read_excel(workbook)
|
1720 |
+
|
1721 |
+
|
1722 |
+
if __name__ == '__main__':
    # Manual smoke run: fingerprint one hard-coded molecule and print it.
    rule = '-' * 10
    print(rule + 'START' + rule)

    SMILES = 'C1=NC2NC3=CNCC3=CC2CC1'
    mol = Chem.MolFromSmiles(SMILES)
    mol2 = Chem.AddHs(mol)
    result = GetPubChemFPs(mol2)

    print('Molecule: %s' % SMILES)
    print('-' * 25)
    print('Results: %s' % result)

    print(rule + 'END' + rule)
|
deepscreen/data/featurizers/fingerprint/pubchemfp.xlsx
ADDED
Binary file (41.2 kB). View file
|
|
deepscreen/data/featurizers/fingerprint/rdkitfp.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
topological fingerprint
|
3 |
+
|
4 |
+
"""
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
from rdkit.Chem.rdmolops import RDKFingerprint
|
8 |
+
from rdkit.Chem import DataStructs
|
9 |
+
|
10 |
+
# Category tag for the fingerprint implemented in this module.
# NOTE(review): nothing in this file reads it -- confirm where it is consumed.
_type = 'topological-based'
|
11 |
+
|
12 |
+
|
13 |
+
def GetRDkitFPs(mol, nBits=2048, return_bitInfo=False):
    """
    #################################################################
    Calculate Daylight-like fingerprint or topological fingerprint.

    Usage:

        arr = GetRDkitFPs(mol)
        arr, bitInfo = GetRDkitFPs(mol, return_bitInfo=True)

    Input: mol is a molecule object. nBits is passed to RDKFingerprint
    as ``fpSize`` (2048 by default). return_bitInfo selects the return
    form.

    Output: a numpy boolean array holding the fingerprint. When
    return_bitInfo is true the result is the tuple (array, bitInfo),
    where bitInfo is the dict that was handed to RDKFingerprint through
    its ``bitInfo`` argument.
    #################################################################
    """
    bitInfo = {}
    fp = RDKFingerprint(mol, fpSize=nBits, bitInfo=bitInfo)
    # Starts empty; ConvertToNumpyArray is relied on to size and fill it.
    arr = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(fp, arr)
    if return_bitInfo:
        # Return the populated dict, not the boolean flag.
        return arr, bitInfo
    return arr
|
deepscreen/data/featurizers/fingerprint/smarts_maccskey.py
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
smartsPatts = {
|
2 |
+
'MACCSFP0': (None, 0),
|
3 |
+
# ignore, Bit 0 is a placeholder and should be ignored: https://github.com/rdkit/rdkit/issues/1726
|
4 |
+
'MACCSFP1': ('?', 0),
|
5 |
+
'MACCSFP2': ('[#104]', 0),
|
6 |
+
'MACCSFP3': ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),
|
7 |
+
'MACCSFP4': ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
|
8 |
+
'MACCSFP5': ('[Sc,Ti,Y,Zr,Hf]', 0),
|
9 |
+
'MACCSFP6': ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
|
10 |
+
'MACCSFP7': ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
|
11 |
+
'MACCSFP8': ('[!#6;!#1]1~*~*~*~1', 0),
|
12 |
+
'MACCSFP9': ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
|
13 |
+
'MACCSFP10': ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
|
14 |
+
'MACCSFP11': ('*1~*~*~*~1', 0),
|
15 |
+
'MACCSFP12': ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
|
16 |
+
'MACCSFP13': ('[#8]~[#7](~[#6])~[#6]', 0),
|
17 |
+
'MACCSFP14': ('[#16]-[#16]', 0),
|
18 |
+
'MACCSFP15': ('[#8]~[#6](~[#8])~[#8]', 0),
|
19 |
+
'MACCSFP16': ('[!#6;!#1]1~*~*~1', 0),
|
20 |
+
'MACCSFP17': ('[#6]#[#6]', 0),
|
21 |
+
'MACCSFP18': ('[#5,#13,#31,#49,#81]', 0),
|
22 |
+
'MACCSFP19': ('*1~*~*~*~*~*~*~1', 0),
|
23 |
+
'MACCSFP20': ('[#14]', 0),
|
24 |
+
'MACCSFP21': ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),
|
25 |
+
'MACCSFP22': ('*1~*~*~1', 0),
|
26 |
+
'MACCSFP23': ('[#7]~[#6](~[#8])~[#8]', 0),
|
27 |
+
'MACCSFP24': ('[#7]-[#8]', 0),
|
28 |
+
'MACCSFP25': ('[#7]~[#6](~[#7])~[#7]', 0),
|
29 |
+
'MACCSFP26': ('[#6]=;@[#6](@*)@*', 0),
|
30 |
+
'MACCSFP27': ('[I]', 0),
|
31 |
+
'MACCSFP28': ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),
|
32 |
+
'MACCSFP29': ('[#15]', 0),
|
33 |
+
'MACCSFP30': ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),
|
34 |
+
'MACCSFP31': ('[!#6;!#1]~[F,Cl,Br,I]', 0),
|
35 |
+
'MACCSFP32': ('[#6]~[#16]~[#7]', 0),
|
36 |
+
'MACCSFP33': ('[#7]~[#16]', 0),
|
37 |
+
'MACCSFP34': ('[CH2]=*', 0),
|
38 |
+
'MACCSFP35': ('[Li,Na,K,Rb,Cs,Fr]', 0),
|
39 |
+
'MACCSFP36': ('[#16R]', 0),
|
40 |
+
'MACCSFP37': ('[#7]~[#6](~[#8])~[#7]', 0),
|
41 |
+
'MACCSFP38': ('[#7]~[#6](~[#6])~[#7]', 0),
|
42 |
+
'MACCSFP39': ('[#8]~[#16](~[#8])~[#8]', 0),
|
43 |
+
'MACCSFP40': ('[#16]-[#8]', 0),
|
44 |
+
'MACCSFP41': ('[#6]#[#7]', 0),
|
45 |
+
'MACCSFP42': ('F', 0),
|
46 |
+
'MACCSFP43': ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),
|
47 |
+
'MACCSFP44': ('?', 0),
|
48 |
+
'MACCSFP45': ('[#6]=[#6]~[#7]', 0),
|
49 |
+
'MACCSFP46': ('Br', 0),
|
50 |
+
'MACCSFP47': ('[#16]~*~[#7]', 0),
|
51 |
+
'MACCSFP48': ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),
|
52 |
+
'MACCSFP49': ('[!+0]', 0),
|
53 |
+
'MACCSFP50': ('[#6]=[#6](~[#6])~[#6]', 0),
|
54 |
+
'MACCSFP51': ('[#6]~[#16]~[#8]', 0),
|
55 |
+
'MACCSFP52': ('[#7]~[#7]', 0),
|
56 |
+
'MACCSFP53': ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),
|
57 |
+
'MACCSFP54': ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),
|
58 |
+
'MACCSFP55': ('[#8]~[#16]~[#8]', 0),
|
59 |
+
'MACCSFP56': ('[#8]~[#7](~[#8])~[#6]', 0),
|
60 |
+
'MACCSFP57': ('[#8R]', 0),
|
61 |
+
'MACCSFP58': ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),
|
62 |
+
'MACCSFP59': ('[#16]!:*:*', 0),
|
63 |
+
'MACCSFP60': ('[#16]=[#8]', 0),
|
64 |
+
'MACCSFP61': ('*~[#16](~*)~*', 0),
|
65 |
+
'MACCSFP62': ('*@*!@*@*', 0),
|
66 |
+
'MACCSFP63': ('[#7]=[#8]', 0),
|
67 |
+
'MACCSFP64': ('*@*!@[#16]', 0),
|
68 |
+
'MACCSFP65': ('c:n', 0),
|
69 |
+
'MACCSFP66': ('[#6]~[#6](~[#6])(~[#6])~*', 0),
|
70 |
+
'MACCSFP67': ('[!#6;!#1]~[#16]', 0),
|
71 |
+
'MACCSFP68': ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),
|
72 |
+
'MACCSFP69': ('[!#6;!#1]~[!#6;!#1;!H0]', 0),
|
73 |
+
'MACCSFP70': ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),
|
74 |
+
'MACCSFP71': ('[#7]~[#8]', 0),
|
75 |
+
'MACCSFP72': ('[#8]~*~*~[#8]', 0),
|
76 |
+
'MACCSFP73': ('[#16]=*', 0),
|
77 |
+
'MACCSFP74': ('[CH3]~*~[CH3]', 0),
|
78 |
+
'MACCSFP75': ('*!@[#7]@*', 0),
|
79 |
+
'MACCSFP76': ('[#6]=[#6](~*)~*', 0),
|
80 |
+
'MACCSFP77': ('[#7]~*~[#7]', 0),
|
81 |
+
'MACCSFP78': ('[#6]=[#7]', 0),
|
82 |
+
'MACCSFP79': ('[#7]~*~*~[#7]', 0),
|
83 |
+
'MACCSFP80': ('[#7]~*~*~*~[#7]', 0),
|
84 |
+
'MACCSFP81': ('[#16]~*(~*)~*', 0),
|
85 |
+
'MACCSFP82': ('*~[CH2]~[!#6;!#1;!H0]', 0),
|
86 |
+
'MACCSFP83': ('[!#6;!#1]1~*~*~*~*~1', 0),
|
87 |
+
'MACCSFP84': ('[NH2]', 0),
|
88 |
+
'MACCSFP85': ('[#6]~[#7](~[#6])~[#6]', 0),
|
89 |
+
'MACCSFP86': ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),
|
90 |
+
'MACCSFP87': ('[F,Cl,Br,I]!@*@*', 0),
|
91 |
+
'MACCSFP88': ('[#16]', 0),
|
92 |
+
'MACCSFP89': ('[#8]~*~*~*~[#8]', 0),
|
93 |
+
'MACCSFP90': (
|
94 |
+
'[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',
|
95 |
+
0),
|
96 |
+
'MACCSFP91': (
|
97 |
+
'[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',
|
98 |
+
0),
|
99 |
+
'MACCSFP92': ('[#8]~[#6](~[#7])~[#6]', 0),
|
100 |
+
'MACCSFP93': ('[!#6;!#1]~[CH3]', 0),
|
101 |
+
'MACCSFP94': ('[!#6;!#1]~[#7]', 0),
|
102 |
+
'MACCSFP95': ('[#7]~*~*~[#8]', 0),
|
103 |
+
'MACCSFP96': ('*1~*~*~*~*~1', 0),
|
104 |
+
'MACCSFP97': ('[#7]~*~*~*~[#8]', 0),
|
105 |
+
'MACCSFP98': ('[!#6;!#1]1~*~*~*~*~*~1', 0),
|
106 |
+
'MACCSFP99': ('[#6]=[#6]', 0),
|
107 |
+
'MACCSFP100': ('*~[CH2]~[#7]', 0),
|
108 |
+
'MACCSFP101': (
|
109 |
+
'[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',
|
110 |
+
0),
|
111 |
+
'MACCSFP102': ('[!#6;!#1]~[#8]', 0),
|
112 |
+
'MACCSFP103': ('Cl', 0),
|
113 |
+
'MACCSFP104': ('[!#6;!#1;!H0]~*~[CH2]~*', 0),
|
114 |
+
'MACCSFP105': ('*@*(@*)@*', 0),
|
115 |
+
'MACCSFP106': ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),
|
116 |
+
'MACCSFP107': ('[F,Cl,Br,I]~*(~*)~*', 0),
|
117 |
+
'MACCSFP108': ('[CH3]~*~*~*~[CH2]~*', 0),
|
118 |
+
'MACCSFP109': ('*~[CH2]~[#8]', 0),
|
119 |
+
'MACCSFP110': ('[#7]~[#6]~[#8]', 0),
|
120 |
+
'MACCSFP111': ('[#7]~*~[CH2]~*', 0),
|
121 |
+
'MACCSFP112': ('*~*(~*)(~*)~*', 0),
|
122 |
+
'MACCSFP113': ('[#8]!:*:*', 0),
|
123 |
+
'MACCSFP114': ('[CH3]~[CH2]~*', 0),
|
124 |
+
'MACCSFP115': ('[CH3]~*~[CH2]~*', 0),
|
125 |
+
'MACCSFP116': ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),
|
126 |
+
'MACCSFP117': ('[#7]~*~[#8]', 0),
|
127 |
+
'MACCSFP118': ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),
|
128 |
+
'MACCSFP119': ('[#7]=*', 0),
|
129 |
+
'MACCSFP120': ('[!#6;R]', 1),
|
130 |
+
'MACCSFP121': ('[#7;R]', 0),
|
131 |
+
'MACCSFP122': ('*~[#7](~*)~*', 0),
|
132 |
+
'MACCSFP123': ('[#8]~[#6]~[#8]', 0),
|
133 |
+
'MACCSFP124': ('[!#6;!#1]~[!#6;!#1]', 0),
|
134 |
+
'MACCSFP125': ('?', 0),
|
135 |
+
'MACCSFP126': ('*!@[#8]!@*', 0),
|
136 |
+
'MACCSFP127': ('*@*!@[#8]', 1),
|
137 |
+
'MACCSFP128': (
|
138 |
+
'[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',
|
139 |
+
0),
|
140 |
+
'MACCSFP129': ('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',
|
141 |
+
0),
|
142 |
+
'MACCSFP130': ('[!#6;!#1]~[!#6;!#1]', 1),
|
143 |
+
'MACCSFP131': ('[!#6;!#1;!H0]', 1),
|
144 |
+
'MACCSFP132': ('[#8]~*~[CH2]~*', 0),
|
145 |
+
'MACCSFP133': ('*@*!@[#7]', 0),
|
146 |
+
'MACCSFP134': ('[F,Cl,Br,I]', 0),
|
147 |
+
'MACCSFP135': ('[#7]!:*:*', 0),
|
148 |
+
'MACCSFP136': ('[#8]=*', 1),
|
149 |
+
'MACCSFP137': ('[!C;!c;R]', 0),
|
150 |
+
'MACCSFP138': ('[!#6;!#1]~[CH2]~*', 1),
|
151 |
+
'MACCSFP139': ('[O;!H0]', 0),
|
152 |
+
'MACCSFP140': ('[#8]', 3),
|
153 |
+
'MACCSFP141': ('[CH3]', 2),
|
154 |
+
'MACCSFP142': ('[#7]', 1),
|
155 |
+
'MACCSFP143': ('*@*!@[#8]', 0),
|
156 |
+
'MACCSFP144': ('*!:*:*!:*', 0),
|
157 |
+
'MACCSFP145': ('*1~*~*~*~*~*~1', 1),
|
158 |
+
'MACCSFP146': ('[#8]', 2),
|
159 |
+
'MACCSFP147': ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),
|
160 |
+
'MACCSFP148': ('*~[!#6;!#1](~*)~*', 0),
|
161 |
+
'MACCSFP149': ('[C;H3,H4]', 1),
|
162 |
+
'MACCSFP150': ('*!@*@*!@*', 0),
|
163 |
+
'MACCSFP151': ('[#7;!H0]', 0),
|
164 |
+
'MACCSFP152': ('[#8]~[#6](~[#6])~[#6]', 0),
|
165 |
+
'MACCSFP153': ('[!#6;!#1]~[CH2]~*', 0),
|
166 |
+
'MACCSFP154': ('[#6]=[#8]', 0),
|
167 |
+
'MACCSFP155': ('*!@[CH2]!@*', 0),
|
168 |
+
'MACCSFP156': ('[#7]~*(~*)~*', 0),
|
169 |
+
'MACCSFP157': ('[#6]-[#8]', 0),
|
170 |
+
'MACCSFP158': ('[#6]-[#7]', 0),
|
171 |
+
'MACCSFP159': ('[#8]', 1),
|
172 |
+
'MACCSFP160': ('[C;H3,H4]', 0),
|
173 |
+
'MACCSFP161': ('[#7]', 0),
|
174 |
+
'MACCSFP162': ('a', 0),
|
175 |
+
'MACCSFP163': ('*1~*~*~*~*~*~1', 0),
|
176 |
+
'MACCSFP164': ('[#8]', 0),
|
177 |
+
'MACCSFP165': ('[R]', 0),
|
178 |
+
'MACCSFP166': ('?', 0)}
|
deepscreen/data/featurizers/fingerprint/smarts_pharmacophore.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# SMARTS patterns grouped by pharmacophore feature type. Each list is also
# exposed under its feature name through ``pharmacophore_smarts`` below.

Donor = [
    "[N;!H0;v3,v4&+1]",
    "[O,S;H1;+0]",
    "[n&H1&+0]",
]

Acceptor = [
    "[O,S;H1;v2;!$(*-*=[O,N,P,S])]",
    "[O;H0;v2]",
    "[O,S;v1;-]",
    "[N;v3;!$(N-*=[O,N,P,S])]",
    "[n&H0&+0]",
    "[o;+0;!$([o]:n);!$([o]:c:n)]",
]

Positive = [
    "[#7;+]",
    "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
    "[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
    "[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]",
]

Negative = [
    "[C,S](=[O,S,P])-[O;H1,-1]",
]

Hydrophobic = [
    "[C;D3,D4](-[CH3])-[CH3]",
    "[S;D2](-C)-C",
]

Aromatic = [
    "a",
]

# Feature name -> list of SMARTS patterns (the same list objects as above).
pharmacophore_smarts = dict(
    Donor=Donor,
    Acceptor=Acceptor,
    Positive=Positive,
    Negative=Negative,
    Hydrophobic=Hydrophobic,
    Aromatic=Aromatic,
)
|
deepscreen/data/featurizers/fingerprint/smarts_pubchem.py
ADDED
@@ -0,0 +1,734 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
smartsPatts = {
|
2 |
+
'PubChemFP0': ('[H]', 3),
|
3 |
+
'PubChemFP1': ('[H]', 7),
|
4 |
+
'PubChemFP2': ('[H]', 15),
|
5 |
+
'PubChemFP3': ('[H]', 31),
|
6 |
+
'PubChemFP4': ('[Li]', 0),
|
7 |
+
'PubChemFP5': ('[Li]', 1),
|
8 |
+
'PubChemFP6': ('[B]', 0),
|
9 |
+
'PubChemFP7': ('[B]', 1),
|
10 |
+
'PubChemFP8': ('[B]', 3),
|
11 |
+
'PubChemFP9': ('[C]', 1),
|
12 |
+
'PubChemFP10': ('[C]', 3),
|
13 |
+
'PubChemFP11': ('[C]', 7),
|
14 |
+
'PubChemFP12': ('[C]', 15),
|
15 |
+
'PubChemFP13': ('[C]', 31),
|
16 |
+
'PubChemFP14': ('[N]', 0),
|
17 |
+
'PubChemFP15': ('[N]', 1),
|
18 |
+
'PubChemFP16': ('[N]', 3),
|
19 |
+
'PubChemFP17': ('[N]', 7),
|
20 |
+
'PubChemFP18': ('[O]', 0),
|
21 |
+
'PubChemFP19': ('[O]', 1),
|
22 |
+
'PubChemFP20': ('[O]', 3),
|
23 |
+
'PubChemFP21': ('[O]', 7),
|
24 |
+
'PubChemFP22': ('[O]', 15),
|
25 |
+
'PubChemFP23': ('[F]', 0),
|
26 |
+
'PubChemFP24': ('[F]', 1),
|
27 |
+
'PubChemFP25': ('[F]', 3),
|
28 |
+
'PubChemFP26': ('[Na]', 0),
|
29 |
+
'PubChemFP27': ('[Na]', 1),
|
30 |
+
'PubChemFP28': ('[Si]', 0),
|
31 |
+
'PubChemFP29': ('[Si]', 1),
|
32 |
+
'PubChemFP30': ('[P]', 0),
|
33 |
+
'PubChemFP31': ('[P]', 1),
|
34 |
+
'PubChemFP32': ('[P]', 3),
|
35 |
+
'PubChemFP33': ('[S]', 0),
|
36 |
+
'PubChemFP34': ('[S]', 1),
|
37 |
+
'PubChemFP35': ('[S]', 3),
|
38 |
+
'PubChemFP36': ('[S]', 7),
|
39 |
+
'PubChemFP37': ('[Cl]', 0),
|
40 |
+
'PubChemFP38': ('[Cl]', 1),
|
41 |
+
'PubChemFP39': ('[Cl]', 3),
|
42 |
+
'PubChemFP40': ('[Cl]', 7),
|
43 |
+
'PubChemFP41': ('[K]', 0),
|
44 |
+
'PubChemFP42': ('[K]', 1),
|
45 |
+
'PubChemFP43': ('[Br]', 0),
|
46 |
+
'PubChemFP44': ('[Br]', 1),
|
47 |
+
'PubChemFP45': ('[Br]', 3),
|
48 |
+
'PubChemFP46': ('[I]', 0),
|
49 |
+
'PubChemFP47': ('[I]', 1),
|
50 |
+
'PubChemFP48': ('[I]', 3),
|
51 |
+
'PubChemFP49': ('[Be]', 0),
|
52 |
+
'PubChemFP50': ('[Mg]', 0),
|
53 |
+
'PubChemFP51': ('[Al]', 0),
|
54 |
+
'PubChemFP52': ('[Ca]', 0),
|
55 |
+
'PubChemFP53': ('[Sc]', 0),
|
56 |
+
'PubChemFP54': ('[Ti]', 0),
|
57 |
+
'PubChemFP55': ('[V]', 0),
|
58 |
+
'PubChemFP56': ('[Cr]', 0),
|
59 |
+
'PubChemFP57': ('[Mn]', 0),
|
60 |
+
'PubChemFP58': ('[Fe]', 0),
|
61 |
+
'PubChemFP59': ('[CO]', 0),
|
62 |
+
'PubChemFP60': ('[Ni]', 0),
|
63 |
+
'PubChemFP61': ('[Cu]', 0),
|
64 |
+
'PubChemFP62': ('[Zn]', 0),
|
65 |
+
'PubChemFP63': ('[Ga]', 0),
|
66 |
+
'PubChemFP64': ('[Ge]', 0),
|
67 |
+
'PubChemFP65': ('[As]', 0),
|
68 |
+
'PubChemFP66': ('[Se]', 0),
|
69 |
+
'PubChemFP67': ('[Kr]', 0),
|
70 |
+
'PubChemFP68': ('[Rb]', 0),
|
71 |
+
'PubChemFP69': ('[Sr]', 0),
|
72 |
+
'PubChemFP70': ('[Y]', 0),
|
73 |
+
'PubChemFP71': ('[Zr]', 0),
|
74 |
+
'PubChemFP72': ('[Nb]', 0),
|
75 |
+
'PubChemFP73': ('[Mo]', 0),
|
76 |
+
'PubChemFP74': ('[Ru]', 0),
|
77 |
+
'PubChemFP75': ('[Rh]', 0),
|
78 |
+
'PubChemFP76': ('[Pd]', 0),
|
79 |
+
'PubChemFP77': ('[Ag]', 0),
|
80 |
+
'PubChemFP78': ('[Cd]', 0),
|
81 |
+
'PubChemFP79': ('[In]', 0),
|
82 |
+
'PubChemFP80': ('[Sn]', 0),
|
83 |
+
'PubChemFP81': ('[Sb]', 0),
|
84 |
+
'PubChemFP82': ('[Te]', 0),
|
85 |
+
'PubChemFP83': ('[Xe]', 0),
|
86 |
+
'PubChemFP84': ('[Cs]', 0),
|
87 |
+
'PubChemFP85': ('[Ba]', 0),
|
88 |
+
'PubChemFP86': ('[Lu]', 0),
|
89 |
+
'PubChemFP87': ('[Hf]', 0),
|
90 |
+
'PubChemFP88': ('[Ta]', 0),
|
91 |
+
'PubChemFP89': ('[W]', 0),
|
92 |
+
'PubChemFP90': ('[Re]', 0),
|
93 |
+
'PubChemFP91': ('[Os]', 0),
|
94 |
+
'PubChemFP92': ('[Ir]', 0),
|
95 |
+
'PubChemFP93': ('[Pt]', 0),
|
96 |
+
'PubChemFP94': ('[Au]', 0),
|
97 |
+
'PubChemFP95': ('[Hg]', 0),
|
98 |
+
'PubChemFP96': ('[Tl]', 0),
|
99 |
+
'PubChemFP97': ('[Pb]', 0),
|
100 |
+
'PubChemFP98': ('[Bi]', 0),
|
101 |
+
'PubChemFP99': ('[La]', 0),
|
102 |
+
'PubChemFP100': ('[Ce]', 0),
|
103 |
+
'PubChemFP101': ('[Pr]', 0),
|
104 |
+
'PubChemFP102': ('[Nd]', 0),
|
105 |
+
'PubChemFP103': ('[Pm]', 0),
|
106 |
+
'PubChemFP104': ('[Sm]', 0),
|
107 |
+
'PubChemFP105': ('[Eu]', 0),
|
108 |
+
'PubChemFP106': ('[Gd]', 0),
|
109 |
+
'PubChemFP107': ('[Tb]', 0),
|
110 |
+
'PubChemFP108': ('[Dy]', 0),
|
111 |
+
'PubChemFP109': ('[Ho]', 0),
|
112 |
+
'PubChemFP110': ('[Er]', 0),
|
113 |
+
'PubChemFP111': ('[Tm]', 0),
|
114 |
+
'PubChemFP112': ('[Yb]', 0),
|
115 |
+
'PubChemFP113': ('[Tc]', 0),
|
116 |
+
'PubChemFP114': ('[U]', 0),
|
117 |
+
'PubChemFP263': ('[Li&!H0]', 0),
|
118 |
+
'PubChemFP264': ('[Li]~[Li]', 0),
|
119 |
+
'PubChemFP265': ('[Li]~[#5]', 0),
|
120 |
+
'PubChemFP266': ('[Li]~[#6]', 0),
|
121 |
+
'PubChemFP267': ('[Li]~[#8]', 0),
|
122 |
+
'PubChemFP268': ('[Li]~[F]', 0),
|
123 |
+
'PubChemFP269': ('[Li]~[#15]', 0),
|
124 |
+
'PubChemFP270': ('[Li]~[#16]', 0),
|
125 |
+
'PubChemFP271': ('[Li]~[Cl]', 0),
|
126 |
+
'PubChemFP272': ('[#5&!H0]', 0),
|
127 |
+
'PubChemFP273': ('[#5]~[#5]', 0),
|
128 |
+
'PubChemFP274': ('[#5]~[#6]', 0),
|
129 |
+
'PubChemFP275': ('[#5]~[#7]', 0),
|
130 |
+
'PubChemFP276': ('[#5]~[#8]', 0),
|
131 |
+
'PubChemFP277': ('[#5]~[F]', 0),
|
132 |
+
'PubChemFP278': ('[#5]~[#14]', 0),
|
133 |
+
'PubChemFP279': ('[#5]~[#15]', 0),
|
134 |
+
'PubChemFP280': ('[#5]~[#16]', 0),
|
135 |
+
'PubChemFP281': ('[#5]~[Cl]', 0),
|
136 |
+
'PubChemFP282': ('[#5]~[Br]', 0),
|
137 |
+
'PubChemFP283': ('[#6&!H0]', 0),
|
138 |
+
'PubChemFP284': ('[#6]~[#6]', 0),
|
139 |
+
'PubChemFP285': ('[#6]~[#7]', 0),
|
140 |
+
'PubChemFP286': ('[#6]~[#8]', 0),
|
141 |
+
'PubChemFP287': ('[#6]~[F]', 0),
|
142 |
+
'PubChemFP288': ('[#6]~[Na]', 0),
|
143 |
+
'PubChemFP289': ('[#6]~[Mg]', 0),
|
144 |
+
'PubChemFP290': ('[#6]~[Al]', 0),
|
145 |
+
'PubChemFP291': ('[#6]~[#14]', 0),
|
146 |
+
'PubChemFP292': ('[#6]~[#15]', 0),
|
147 |
+
'PubChemFP293': ('[#6]~[#16]', 0),
|
148 |
+
'PubChemFP294': ('[#6]~[Cl]', 0),
|
149 |
+
'PubChemFP295': ('[#6]~[#33]', 0),
|
150 |
+
'PubChemFP296': ('[#6]~[#34]', 0),
|
151 |
+
'PubChemFP297': ('[#6]~[Br]', 0),
|
152 |
+
'PubChemFP298': ('[#6]~[I]', 0),
|
153 |
+
'PubChemFP299': ('[#7&!H0]', 0),
|
154 |
+
'PubChemFP300': ('[#7]~[#7]', 0),
|
155 |
+
'PubChemFP301': ('[#7]~[#8]', 0),
|
156 |
+
'PubChemFP302': ('[#7]~[F]', 0),
|
157 |
+
'PubChemFP303': ('[#7]~[#14]', 0),
|
158 |
+
'PubChemFP304': ('[#7]~[#15]', 0),
|
159 |
+
'PubChemFP305': ('[#7]~[#16]', 0),
|
160 |
+
'PubChemFP306': ('[#7]~[Cl]', 0),
|
161 |
+
'PubChemFP307': ('[#7]~[Br]', 0),
|
162 |
+
'PubChemFP308': ('[#8&!H0]', 0),
|
163 |
+
'PubChemFP309': ('[#8]~[#8]', 0),
|
164 |
+
'PubChemFP310': ('[#8]~[Mg]', 0),
|
165 |
+
'PubChemFP311': ('[#8]~[Na]', 0),
|
166 |
+
'PubChemFP312': ('[#8]~[Al]', 0),
|
167 |
+
'PubChemFP313': ('[#8]~[#14]', 0),
|
168 |
+
'PubChemFP314': ('[#8]~[#15]', 0),
|
169 |
+
'PubChemFP315': ('[#8]~[K]', 0),
|
170 |
+
'PubChemFP316': ('[F]~[#15]', 0),
|
171 |
+
'PubChemFP317': ('[F]~[#16]', 0),
|
172 |
+
'PubChemFP318': ('[Al&!H0]', 0),
|
173 |
+
'PubChemFP319': ('[Al]~[Cl]', 0),
|
174 |
+
'PubChemFP320': ('[#14&!H0]', 0),
|
175 |
+
'PubChemFP321': ('[#14]~[#14]', 0),
|
176 |
+
'PubChemFP322': ('[#14]~[Cl]', 0),
|
177 |
+
'PubChemFP323': ('[#15&!H0]', 0),
|
178 |
+
'PubChemFP324': ('[#15]~[#15]', 0),
|
179 |
+
'PubChemFP325': ('[#33&!H0]', 0),
|
180 |
+
'PubChemFP326': ('[#33]~[#33]', 0),
|
181 |
+
'PubChemFP327': ('[#6](~Br)(~[#6])', 0),
|
182 |
+
'PubChemFP328': ('[#6](~Br)(~[#6])(~[#6])', 0),
|
183 |
+
'PubChemFP329': ('[#6&!H0]~[Br]', 0),
|
184 |
+
'PubChemFP330': ('[#6](~[Br])(:[c])', 0),
|
185 |
+
'PubChemFP331': ('[#6](~[Br])(:[n])', 0),
|
186 |
+
'PubChemFP332': ('[#6](~[#6])(~[#6])', 0),
|
187 |
+
'PubChemFP333': ('[#6](~[#6])(~[#6])(~[#6])', 0),
|
188 |
+
'PubChemFP334': ('[#6](~[#6])(~[#6])(~[#6])(~[#6])', 0),
|
189 |
+
'PubChemFP335': ('[#6H1](~[#6])(~[#6])(~[#6])', 0),
|
190 |
+
'PubChemFP336': ('[#6](~[#6])(~[#6])(~[#6])(~[#7])', 0),
|
191 |
+
'PubChemFP337': ('[#6](~[#6])(~[#6])(~[#6])(~[#8])', 0),
|
192 |
+
'PubChemFP338': ('[#6H1](~[#6])(~[#6])(~[#7])', 0),
|
193 |
+
'PubChemFP339': ('[#6H1](~[#6])(~[#6])(~[#8])', 0),
|
194 |
+
'PubChemFP340': ('[#6](~[#6])(~[#6])(~[#7])', 0),
|
195 |
+
'PubChemFP341': ('[#6](~[#6])(~[#6])(~[#8])', 0),
|
196 |
+
'PubChemFP342': ('[#6](~[#6])(~[Cl])', 0),
|
197 |
+
'PubChemFP343': ('[#6&!H0](~[#6])(~[Cl])', 0),
|
198 |
+
'PubChemFP344': ('[#6H,#6H2,#6H3,#6H4]~[#6]', 0),
|
199 |
+
'PubChemFP345': ('[#6&!H0](~[#6])(~[#7])', 0),
|
200 |
+
'PubChemFP346': ('[#6&!H0](~[#6])(~[#8])', 0),
|
201 |
+
'PubChemFP347': ('[#6H1](~[#6])(~[#8])(~[#8])', 0),
|
202 |
+
'PubChemFP348': ('[#6&!H0](~[#6])(~[#15])', 0),
|
203 |
+
'PubChemFP349': ('[#6&!H0](~[#6])(~[#16])', 0),
|
204 |
+
'PubChemFP350': ('[#6](~[#6])(~[I])', 0),
|
205 |
+
'PubChemFP351': ('[#6](~[#6])(~[#7])', 0),
|
206 |
+
'PubChemFP352': ('[#6](~[#6])(~[#8])', 0),
|
207 |
+
'PubChemFP353': ('[#6](~[#6])(~[#16])', 0),
|
208 |
+
'PubChemFP354': ('[#6](~[#6])(~[#14])', 0),
|
209 |
+
'PubChemFP355': ('[#6](~[#6])(:c)', 0),
|
210 |
+
'PubChemFP356': ('[#6](~[#6])(:c)(:c)', 0),
|
211 |
+
'PubChemFP357': ('[#6](~[#6])(:c)(:n)', 0),
|
212 |
+
'PubChemFP358': ('[#6](~[#6])(:n)', 0),
|
213 |
+
'PubChemFP359': ('[#6](~[#6])(:n)(:n)', 0),
|
214 |
+
'PubChemFP360': ('[#6](~[Cl])(~[Cl])', 0),
|
215 |
+
'PubChemFP361': ('[#6&!H0](~[Cl])', 0),
|
216 |
+
'PubChemFP362': ('[#6](~[Cl])(:c)', 0),
|
217 |
+
'PubChemFP363': ('[#6](~[F])(~[F])', 0),
|
218 |
+
'PubChemFP364': ('[#6](~[F])(:c)', 0),
|
219 |
+
'PubChemFP365': ('[#6&!H0](~[#7])', 0),
|
220 |
+
'PubChemFP366': ('[#6&!H0](~[#8])', 0),
|
221 |
+
'PubChemFP367': ('[#6&!H0](~[#8])(~[#8])', 0),
|
222 |
+
'PubChemFP368': ('[#6&!H0](~[#16])', 0),
|
223 |
+
'PubChemFP369': ('[#6&!H0](~[#14])', 0),
|
224 |
+
'PubChemFP370': ('[#6&!H0]:c', 0),
|
225 |
+
'PubChemFP371': ('[#6&!H0](:c)(:c)', 0),
|
226 |
+
'PubChemFP372': ('[#6&!H0](:c)(:n)', 0),
|
227 |
+
'PubChemFP373': ('[#6&!H0](:n)', 0),
|
228 |
+
'PubChemFP374': ('[#6H3]', 0),
|
229 |
+
'PubChemFP375': ('[#6](~[#7])(~[#7])', 0),
|
230 |
+
'PubChemFP376': ('[#6](~[#7])(:c)', 0),
|
231 |
+
'PubChemFP377': ('[#6](~[#7])(:c)(:c)', 0),
|
232 |
+
'PubChemFP378': ('[#6](~[#7])(:c)(:n)', 0),
|
233 |
+
'PubChemFP379': ('[#6](~[#7])(:n)', 0),
|
234 |
+
'PubChemFP380': ('[#6](~[#8])(~[#8])', 0),
|
235 |
+
'PubChemFP381': ('[#6](~[#8])(:c)', 0),
|
236 |
+
'PubChemFP382': ('[#6](~[#8])(:c)(:c)', 0),
|
237 |
+
'PubChemFP383': ('[#6](~[#16])(:c)', 0),
|
238 |
+
'PubChemFP384': ('[#6](:c)(:c)', 0),
|
239 |
+
'PubChemFP385': ('[#6](:c)(:c)(:c)', 0),
|
240 |
+
'PubChemFP386': ('[#6](:c)(:c)(:n)', 0),
|
241 |
+
'PubChemFP387': ('[#6](:c)(:n)', 0),
|
242 |
+
'PubChemFP388': ('[#6](:c)(:n)(:n)', 0),
|
243 |
+
'PubChemFP389': ('[#6](:n)(:n)', 0),
|
244 |
+
'PubChemFP390': ('[#7](~[#6])(~[#6])', 0),
|
245 |
+
'PubChemFP391': ('[#7](~[#6])(~[#6])(~[#6])', 0),
|
246 |
+
'PubChemFP392': ('[#7&!H0](~[#6])(~[#6])', 0),
|
247 |
+
'PubChemFP393': ('[#7&!H0](~[#6])', 0),
|
248 |
+
'PubChemFP394': ('[#7&!H0](~[#6])(~[#7])', 0),
|
249 |
+
'PubChemFP395': ('[#7](~[#6])(~[#8])', 0),
|
250 |
+
'PubChemFP396': ('[#7](~[#6])(:c)', 0),
|
251 |
+
'PubChemFP397': ('[#7](~[#6])(:c)(:c)', 0),
|
252 |
+
'PubChemFP398': ('[#7&!H0](~[#7])', 0),
|
253 |
+
'PubChemFP399': ('[#7&!H0](:c)', 0),
|
254 |
+
'PubChemFP400': ('[#7&!H0](:c)(:c)', 0),
|
255 |
+
'PubChemFP401': ('[#7](~[#8])(~[#8])', 0),
|
256 |
+
'PubChemFP402': ('[#7](~[#8])(:o)', 0),
|
257 |
+
'PubChemFP403': ('[#7](:c)(:c)', 0),
|
258 |
+
'PubChemFP404': ('[#7](:c)(:c)(:c)', 0),
|
259 |
+
'PubChemFP405': ('[#8](~[#6])(~[#6])', 0),
|
260 |
+
'PubChemFP406': ('[#8&!H0](~[#6])', 0),
|
261 |
+
'PubChemFP407': ('[#8](~[#6])(~[#15])', 0),
|
262 |
+
'PubChemFP408': ('[#8&!H0](~[#16])', 0),
|
263 |
+
'PubChemFP409': ('[#8](:c)(:c)', 0),
|
264 |
+
'PubChemFP410': ('[#15](~[#6])(~[#6])', 0),
|
265 |
+
'PubChemFP411': ('[#15](~[#8])(~[#8])', 0),
|
266 |
+
'PubChemFP412': ('[#16](~[#6])(~[#6])', 0),
|
267 |
+
'PubChemFP413': ('[#16&!H0](~[#6])', 0),
|
268 |
+
'PubChemFP414': ('[#16](~[#6])(~[#8])', 0),
|
269 |
+
'PubChemFP415': ('[#14](~[#6])(~[#6])', 0),
|
270 |
+
'PubChemFP416': ('[#6]=,:[#6]', 0),
|
271 |
+
'PubChemFP417': ('[#6]#[#6]', 0),
|
272 |
+
'PubChemFP418': ('[#6]=,:[#7]', 0),
|
273 |
+
'PubChemFP419': ('[#6]#[#7]', 0),
|
274 |
+
'PubChemFP420': ('[#6]=,:[#8]', 0),
|
275 |
+
'PubChemFP421': ('[#6]=,:[#16]', 0),
|
276 |
+
'PubChemFP422': ('[#7]=,:[#7]', 0),
|
277 |
+
'PubChemFP423': ('[#7]=,:[#8]', 0),
|
278 |
+
'PubChemFP424': ('[#7]=,:[#15]', 0),
|
279 |
+
'PubChemFP425': ('[#15]=,:[#8]', 0),
|
280 |
+
'PubChemFP426': ('[#15]=,:[#15]', 0),
|
281 |
+
'PubChemFP427': ('[#6](#[#6])(-,:[#6])', 0),
|
282 |
+
'PubChemFP428': ('[#6&!H0](#[#6])', 0),
|
283 |
+
'PubChemFP429': ('[#6](#[#7])(-,:[#6])', 0),
|
284 |
+
'PubChemFP430': ('[#6](-,:[#6])(-,:[#6])(=,:[#6])', 0),
|
285 |
+
'PubChemFP431': ('[#6](-,:[#6])(-,:[#6])(=,:[#7])', 0),
|
286 |
+
'PubChemFP432': ('[#6](-,:[#6])(-,:[#6])(=,:[#8])', 0),
|
287 |
+
'PubChemFP433': ('[#6](-,:[#6])([Cl])(=,:[#8])', 0),
|
288 |
+
'PubChemFP434': ('[#6&!H0](-,:[#6])(=,:[#6])', 0),
|
289 |
+
'PubChemFP435': ('[#6&!H0](-,:[#6])(=,:[#7])', 0),
|
290 |
+
'PubChemFP436': ('[#6&!H0](-,:[#6])(=,:[#8])', 0),
|
291 |
+
'PubChemFP437': ('[#6](-,:[#6])(-,:[#7])(=,:[#6])', 0),
|
292 |
+
'PubChemFP438': ('[#6](-,:[#6])(-,:[#7])(=,:[#7])', 0),
|
293 |
+
'PubChemFP439': ('[#6](-,:[#6])(-,:[#7])(=,:[#8])', 0),
|
294 |
+
'PubChemFP440': ('[#6](-,:[#6])(-,:[#8])(=,:[#8])', 0),
|
295 |
+
'PubChemFP441': ('[#6](-,:[#6])(=,:[#6])', 0),
|
296 |
+
'PubChemFP442': ('[#6](-,:[#6])(=,:[#7])', 0),
|
297 |
+
'PubChemFP443': ('[#6](-,:[#6])(=,:[#8])', 0),
|
298 |
+
'PubChemFP444': ('[#6]([Cl])(=,:[#8])', 0),
|
299 |
+
'PubChemFP445': ('[#6&!H0](-,:[#7])(=,:[#6])', 0),
|
300 |
+
'PubChemFP446': ('[#6&!H0](=,:[#6])', 0),
|
301 |
+
'PubChemFP447': ('[#6&!H0](=,:[#7])', 0),
|
302 |
+
'PubChemFP448': ('[#6&!H0](=,:[#8])', 0),
|
303 |
+
'PubChemFP449': ('[#6](-,:[#7])(=,:[#6])', 0),
|
304 |
+
'PubChemFP450': ('[#6](-,:[#7])(=,:[#7])', 0),
|
305 |
+
'PubChemFP451': ('[#6](-,:[#7])(=,:[#8])', 0),
|
306 |
+
'PubChemFP452': ('[#6](-,:[#8])(=,:[#8])', 0),
|
307 |
+
'PubChemFP453': ('[#7](-,:[#6])(=,:[#6])', 0),
|
308 |
+
'PubChemFP454': ('[#7](-,:[#6])(=,:[#8])', 0),
|
309 |
+
'PubChemFP455': ('[#7](-,:[#8])(=,:[#8])', 0),
|
310 |
+
'PubChemFP456': ('[#15](-,:[#8])(=,:[#8])', 0),
|
311 |
+
'PubChemFP457': ('[#16](-,:[#6])(=,:[#8])', 0),
|
312 |
+
'PubChemFP458': ('[#16](-,:[#8])(=,:[#8])', 0),
|
313 |
+
'PubChemFP459': ('[#16](=,:[#8])(=,:[#8])', 0),
|
314 |
+
'PubChemFP460': ('[#6]-,:[#6]-,:[#6]#[#6]', 0),
|
315 |
+
'PubChemFP461': ('[#8]-,:[#6]-,:[#6]=,:[#7]', 0),
|
316 |
+
'PubChemFP462': ('[#8]-,:[#6]-,:[#6]=,:[#8]', 0),
|
317 |
+
'PubChemFP463': ('[#7]:[#6]-,:[#16&!H0]', 0),
|
318 |
+
'PubChemFP464': ('[#7]-,:[#6]-,:[#6]=,:[#6]', 0),
|
319 |
+
'PubChemFP465': ('[#8]=,:[#16]-,:[#6]-,:[#6]', 0),
|
320 |
+
'PubChemFP466': ('[#7]#[#6]-,:[#6]=,:[#6]', 0),
|
321 |
+
'PubChemFP467': ('[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
|
322 |
+
'PubChemFP468': ('[#8]=,:[#16]-,:[#6]-,:[#7]', 0),
|
323 |
+
'PubChemFP469': ('[#16]-,:[#16]-,:[#6]:[#6]', 0),
|
324 |
+
'PubChemFP470': ('[#6]:[#6]-,:[#6]=,:[#6]', 0),
|
325 |
+
'PubChemFP471': ('[#16]:[#6]:[#6]:[#6]', 0),
|
326 |
+
'PubChemFP472': ('[#6]:[#7]:[#6]-,:[#6]', 0),
|
327 |
+
'PubChemFP473': ('[#16]-,:[#6]:[#7]:[#6]', 0),
|
328 |
+
'PubChemFP474': ('[#16]:[#6]:[#6]:[#7]', 0),
|
329 |
+
'PubChemFP475': ('[#16]-,:[#6]=,:[#7]-,:[#6]', 0),
|
330 |
+
'PubChemFP476': ('[#6]-,:[#8]-,:[#6]=,:[#6]', 0),
|
331 |
+
'PubChemFP477': ('[#7]-,:[#7]-,:[#6]:[#6]', 0),
|
332 |
+
'PubChemFP478': ('[#16]-,:[#6]=,:[#7&!H0]', 0),
|
333 |
+
'PubChemFP479': ('[#16]-,:[#6]-,:[#16]-,:[#6]', 0),
|
334 |
+
'PubChemFP480': ('[#6]:[#16]:[#6]-,:[#6]', 0),
|
335 |
+
'PubChemFP481': ('[#8]-,:[#16]-,:[#6]:[#6]', 0),
|
336 |
+
'PubChemFP482': ('[#6]:[#7]-,:[#6]:[#6]', 0),
|
337 |
+
'PubChemFP483': ('[#7]-,:[#16]-,:[#6]:[#6]', 0),
|
338 |
+
'PubChemFP484': ('[#7]-,:[#6]:[#7]:[#6]', 0),
|
339 |
+
'PubChemFP485': ('[#7]:[#6]:[#6]:[#7]', 0),
|
340 |
+
'PubChemFP486': ('[#7]-,:[#6]:[#7]:[#7]', 0),
|
341 |
+
'PubChemFP487': ('[#7]-,:[#6]=,:[#7]-,:[#6]', 0),
|
342 |
+
'PubChemFP488': ('[#7]-,:[#6]=,:[#7&!H0]', 0),
|
343 |
+
'PubChemFP489': ('[#7]-,:[#6]-,:[#16]-,:[#6]', 0),
|
344 |
+
'PubChemFP490': ('[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
|
345 |
+
'PubChemFP491': ('[#6]-,:[#7]:[#6&!H0]', 0),
|
346 |
+
'PubChemFP492': ('[#7]-,:[#6]:[#8]:[#6]', 0),
|
347 |
+
'PubChemFP493': ('[#8]=,:[#6]-,:[#6]:[#6]', 0),
|
348 |
+
'PubChemFP494': ('[#8]=,:[#6]-,:[#6]:[#7]', 0),
|
349 |
+
'PubChemFP495': ('[#6]-,:[#7]-,:[#6]:[#6]', 0),
|
350 |
+
'PubChemFP496': ('[#7]:[#7]-,:[#6&!H0]', 0),
|
351 |
+
'PubChemFP497': ('[#8]-,:[#6]:[#6]:[#7]', 0),
|
352 |
+
'PubChemFP498': ('[#8]-,:[#6]=,:[#6]-,:[#6]', 0),
|
353 |
+
'PubChemFP499': ('[#7]-,:[#6]:[#6]:[#7]', 0),
|
354 |
+
'PubChemFP500': ('[#6]-,:[#16]-,:[#6]:[#6]', 0),
|
355 |
+
'PubChemFP501': ('[Cl]-,:[#6]:[#6]-,:[#6]', 0),
|
356 |
+
'PubChemFP502': ('[#7]-,:[#6]=,:[#6&!H0]', 0),
|
357 |
+
'PubChemFP503': ('[Cl]-,:[#6]:[#6&!H0]', 0),
|
358 |
+
'PubChemFP504': ('[#7]:[#6]:[#7]-,:[#6]', 0),
|
359 |
+
'PubChemFP505': ('[Cl]-,:[#6]:[#6]-,:[#8]', 0),
|
360 |
+
'PubChemFP506': ('[#6]-,:[#6]:[#7]:[#6]', 0),
|
361 |
+
'PubChemFP507': ('[#6]-,:[#6]-,:[#16]-,:[#6]', 0),
|
362 |
+
'PubChemFP508': ('[#16]=,:[#6]-,:[#7]-,:[#6]', 0),
|
363 |
+
'PubChemFP509': ('[Br]-,:[#6]:[#6]-,:[#6]', 0),
|
364 |
+
'PubChemFP510': ('[#7&!H0]-,:[#7&!H0]', 0),
|
365 |
+
'PubChemFP511': ('[#16]=,:[#6]-,:[#7&!H0]', 0),
|
366 |
+
'PubChemFP512': ('[#6]-,:[#33]-[#8&!H0]', 0),
|
367 |
+
'PubChemFP513': ('[#16]:[#6]:[#6&!H0]', 0),
|
368 |
+
'PubChemFP514': ('[#8]-,:[#7]-,:[#6]-,:[#6]', 0),
|
369 |
+
'PubChemFP515': ('[#7]-,:[#7]-,:[#6]-,:[#6]', 0),
|
370 |
+
'PubChemFP516': ('[#6H,#6H2,#6H3]=,:[#6H,#6H2,#6H3]', 0),
|
371 |
+
'PubChemFP517': ('[#7]-,:[#7]-,:[#6]-,:[#7]', 0),
|
372 |
+
'PubChemFP518': ('[#8]=,:[#6]-,:[#7]-,:[#7]', 0),
|
373 |
+
'PubChemFP519': ('[#7]=,:[#6]-,:[#7]-,:[#6]', 0),
|
374 |
+
'PubChemFP520': ('[#6]=,:[#6]-,:[#6]:[#6]', 0),
|
375 |
+
'PubChemFP521': ('[#6]:[#7]-,:[#6&!H0]', 0),
|
376 |
+
'PubChemFP522': ('[#6]-,:[#7]-,:[#7&!H0]', 0),
|
377 |
+
'PubChemFP523': ('[#7]:[#6]:[#6]-,:[#6]', 0),
|
378 |
+
'PubChemFP524': ('[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
379 |
+
'PubChemFP525': ('[#33]-,:[#6]:[#6&!H0]', 0),
|
380 |
+
'PubChemFP526': ('[Cl]-,:[#6]:[#6]-,:[Cl]', 0),
|
381 |
+
'PubChemFP527': ('[#6]:[#6]:[#7&!H0]', 0),
|
382 |
+
'PubChemFP528': ('[#7&!H0]-,:[#6&!H0]', 0),
|
383 |
+
'PubChemFP529': ('[Cl]-,:[#6]-,:[#6]-,:[Cl]', 0),
|
384 |
+
'PubChemFP530': ('[#7]:[#6]-,:[#6]:[#6]', 0),
|
385 |
+
'PubChemFP531': ('[#16]-,:[#6]:[#6]-,:[#6]', 0),
|
386 |
+
'PubChemFP532': ('[#16]-,:[#6]:[#6&!H0]', 0),
|
387 |
+
'PubChemFP533': ('[#16]-,:[#6]:[#6]-,:[#7]', 0),
|
388 |
+
'PubChemFP534': ('[#16]-,:[#6]:[#6]-,:[#8]', 0),
|
389 |
+
'PubChemFP535': ('[#8]=,:[#6]-,:[#6]-,:[#6]', 0),
|
390 |
+
'PubChemFP536': ('[#8]=,:[#6]-,:[#6]-,:[#7]', 0),
|
391 |
+
'PubChemFP537': ('[#8]=,:[#6]-,:[#6]-,:[#8]', 0),
|
392 |
+
'PubChemFP538': ('[#7]=,:[#6]-,:[#6]-,:[#6]', 0),
|
393 |
+
'PubChemFP539': ('[#7]=,:[#6]-,:[#6&!H0]', 0),
|
394 |
+
'PubChemFP540': ('[#6]-,:[#7]-,:[#6&!H0]', 0),
|
395 |
+
'PubChemFP541': ('[#8]-,:[#6]:[#6]-,:[#6]', 0),
|
396 |
+
'PubChemFP542': ('[#8]-,:[#6]:[#6&!H0]', 0),
|
397 |
+
'PubChemFP543': ('[#8]-,:[#6]:[#6]-,:[#7]', 0),
|
398 |
+
'PubChemFP544': ('[#8]-,:[#6]:[#6]-,:[#8]', 0),
|
399 |
+
'PubChemFP545': ('[#7]-,:[#6]:[#6]-,:[#6]', 0),
|
400 |
+
'PubChemFP546': ('[#7]-,:[#6]:[#6&!H0]', 0),
|
401 |
+
'PubChemFP547': ('[#7]-,:[#6]:[#6]-,:[#7]', 0),
|
402 |
+
'PubChemFP548': ('[#8]-,:[#6]-,:[#6]:[#6]', 0),
|
403 |
+
'PubChemFP549': ('[#7]-,:[#6]-,:[#6]:[#6]', 0),
|
404 |
+
'PubChemFP550': ('[Cl]-,:[#6]-,:[#6]-,:[#6]', 0),
|
405 |
+
'PubChemFP551': ('[Cl]-,:[#6]-,:[#6]-,:[#8]', 0),
|
406 |
+
'PubChemFP552': ('[#6]:[#6]-,:[#6]:[#6]', 0),
|
407 |
+
'PubChemFP553': ('[#8]=,:[#6]-,:[#6]=,:[#6]', 0),
|
408 |
+
'PubChemFP554': ('[Br]-,:[#6]-,:[#6]-,:[#6]', 0),
|
409 |
+
'PubChemFP555': ('[#7]=,:[#6]-,:[#6]=,:[#6]', 0),
|
410 |
+
'PubChemFP556': ('[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
|
411 |
+
'PubChemFP557': ('[#7]:[#6]-,:[#8&!H0]', 0),
|
412 |
+
'PubChemFP558': ('[#8]=,:[#7]-,:c:c', 0),
|
413 |
+
'PubChemFP559': ('[#8]-,:[#6]-,:[#7&!H0]', 0),
|
414 |
+
'PubChemFP560': ('[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
|
415 |
+
'PubChemFP561': ('[Cl]-,:[#6]-,:[#6]=,:[#8]', 0),
|
416 |
+
'PubChemFP562': ('[Br]-,:[#6]-,:[#6]=,:[#8]', 0),
|
417 |
+
'PubChemFP563': ('[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
|
418 |
+
'PubChemFP564': ('[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
|
419 |
+
'PubChemFP565': ('[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
420 |
+
'PubChemFP566': ('[#8]-,:[#6]-,:[#6]-,:[#7]', 0),
|
421 |
+
'PubChemFP567': ('[#8]-,:[#6]-,:[#6]-,:[#8]', 0),
|
422 |
+
'PubChemFP568': ('N#[#6]-,:[#6]-,:[#6]', 0),
|
423 |
+
'PubChemFP569': ('[#7]-,:[#6]-,:[#6]-,:[#7]', 0),
|
424 |
+
'PubChemFP570': ('[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
425 |
+
'PubChemFP571': ('[#6&!H0]-,:[#8&!H0]', 0),
|
426 |
+
'PubChemFP572': ('n:c:n:c', 0),
|
427 |
+
'PubChemFP573': ('[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
|
428 |
+
'PubChemFP574': ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
429 |
+
'PubChemFP575': ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
|
430 |
+
'PubChemFP576': ('[#7]=,:[#6]-,:[#6]:[#6&!H0]', 0),
|
431 |
+
'PubChemFP577': ('c:c-,:[#7]-,:c:c', 0),
|
432 |
+
'PubChemFP578': ('[#6]-,:[#6]:[#6]-,:c:c', 0),
|
433 |
+
'PubChemFP579': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
434 |
+
'PubChemFP580': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
435 |
+
'PubChemFP581': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
436 |
+
'PubChemFP582': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
437 |
+
'PubChemFP583': ('[Cl]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
438 |
+
'PubChemFP584': ('c:c-,:[#6]=,:[#6]-,:[#6]', 0),
|
439 |
+
'PubChemFP585': ('[#6]-,:[#6]:[#6]-,:[#7]-,:[#6]', 0),
|
440 |
+
'PubChemFP586': ('[#6]-,:[#16]-,:[#6]-,:[#6]-,:[#6]', 0),
|
441 |
+
'PubChemFP587': ('[#7]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
442 |
+
'PubChemFP588': ('[#8]=,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
443 |
+
'PubChemFP589': ('[#6]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
444 |
+
'PubChemFP590': ('[#6]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
445 |
+
'PubChemFP591': ('[Cl]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
446 |
+
'PubChemFP592': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
447 |
+
'PubChemFP593': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
448 |
+
'PubChemFP594': ('[#6]-,:[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
|
449 |
+
'PubChemFP595': ('c:c-,:[#6]-,:[#6]-,:[#6]', 0),
|
450 |
+
'PubChemFP596': ('[#7]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
451 |
+
'PubChemFP597': ('[#8]=,:[#6]-,:[#6]-,:c:c', 0),
|
452 |
+
'PubChemFP598': ('[Cl]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
453 |
+
'PubChemFP599': ('[#6H,#6H2,#6H3]-,:[#6]=,:[#6H,#6H2,#6H3]', 0),
|
454 |
+
'PubChemFP600': ('[#7]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
455 |
+
'PubChemFP601': ('[#7]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
|
456 |
+
'PubChemFP602': ('[#8]=,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
457 |
+
'PubChemFP603': ('[#6]-,:c:c:[#6]-,:[#6]', 0),
|
458 |
+
'PubChemFP604': ('[#6]-,:[#8]-,:[#6]-,:[#6]:c', 0),
|
459 |
+
'PubChemFP605': ('[#8]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
460 |
+
'PubChemFP606': ('[#8]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
461 |
+
'PubChemFP607': ('[#7]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
462 |
+
'PubChemFP608': ('[#6]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
463 |
+
'PubChemFP609': ('[Cl]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
464 |
+
'PubChemFP610': ('[#6]-,:[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
|
465 |
+
'PubChemFP611': ('[#7]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
466 |
+
'PubChemFP612': ('[#7]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
467 |
+
'PubChemFP613': ('[#6]-,:[#7]-,:[#6]-,:[#6]-,:[#6]', 0),
|
468 |
+
'PubChemFP614': ('[#6]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
469 |
+
'PubChemFP615': ('[#7]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
470 |
+
'PubChemFP616': ('c:c:n:n:c', 0),
|
471 |
+
'PubChemFP617': ('[#6]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
472 |
+
'PubChemFP618': ('c:[#6]-,:[#6]-,:[#6]:c', 0),
|
473 |
+
'PubChemFP619': ('[#8]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
474 |
+
'PubChemFP620': ('c:c-,:[#8]-,:[#6]-,:[#6]', 0),
|
475 |
+
'PubChemFP621': ('[#7]-,:[#6]:c:c:n', 0),
|
476 |
+
'PubChemFP622': ('[#8]=,:[#6]-,:[#8]-,:[#6]:c', 0),
|
477 |
+
'PubChemFP623': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
478 |
+
'PubChemFP624': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#7]', 0),
|
479 |
+
'PubChemFP625': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
|
480 |
+
'PubChemFP626': ('[#6]-,:[#8]-,:[#6]:[#6]-,:[#6]', 0),
|
481 |
+
'PubChemFP627': ('[#8]=,:[#33]-,:[#6]:c:c', 0),
|
482 |
+
'PubChemFP628': ('[#6]-,:[#7]-,:[#6]-,:[#6]:c', 0),
|
483 |
+
'PubChemFP629': ('[#16]-,:[#6]:c:c-,:[#7]', 0),
|
484 |
+
'PubChemFP630': ('[#8]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
485 |
+
'PubChemFP631': ('[#8]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
486 |
+
'PubChemFP632': ('[#6]-,:[#6]-,:[#8]-,:[#6]:c', 0),
|
487 |
+
'PubChemFP633': ('[#7]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
488 |
+
'PubChemFP634': ('[#6]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
489 |
+
'PubChemFP635': ('[#7]-,:[#7]-,:[#6]-,:[#7&!H0]', 0),
|
490 |
+
'PubChemFP636': ('[#6]-,:[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
|
491 |
+
'PubChemFP637': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
492 |
+
'PubChemFP638': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
493 |
+
'PubChemFP639': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
494 |
+
'PubChemFP640': ('[#6]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
495 |
+
'PubChemFP641': ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
|
496 |
+
'PubChemFP642': ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
497 |
+
'PubChemFP643': ('[#6&!H0]-,:[#6]-,:[#7&!H0]', 0),
|
498 |
+
'PubChemFP644': ('[#6]-,:[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
|
499 |
+
'PubChemFP645': ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
500 |
+
'PubChemFP646': ('[#8]=,:[#6]-,:[#7]-,:[#6&!H0]', 0),
|
501 |
+
'PubChemFP647': ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#7]', 0),
|
502 |
+
'PubChemFP648': ('[#8]=,:[#7]-,:[#6]:[#6]-,:[#7]', 0),
|
503 |
+
'PubChemFP649': ('[#8]=,:[#7]-,:c:c-,:[#8]', 0),
|
504 |
+
'PubChemFP650': ('[#8]=,:[#6]-,:[#7]-,:[#6]=,:[#8]', 0),
|
505 |
+
'PubChemFP651': ('[#8]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
506 |
+
'PubChemFP652': ('[#8]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
|
507 |
+
'PubChemFP653': ('[#8]-,:[#6]:[#6]:[#6]-,:[#8]', 0),
|
508 |
+
'PubChemFP654': ('[#7]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
509 |
+
'PubChemFP655': ('[#8]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
510 |
+
'PubChemFP656': ('[#6]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
511 |
+
'PubChemFP657': ('[#6]-,:[#7]-,:[#6]:[#6]-,:[#6]', 0),
|
512 |
+
'PubChemFP658': ('[#6]-,:[#6]-,:[#16]-,:[#6]-,:[#6]', 0),
|
513 |
+
'PubChemFP659': ('[#8]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
514 |
+
'PubChemFP660': ('[#6]-,:[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
|
515 |
+
'PubChemFP661': ('[#8]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
516 |
+
'PubChemFP662': ('[#8]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
517 |
+
'PubChemFP663': ('[#8]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
518 |
+
'PubChemFP664': ('[#6]-,:[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
|
519 |
+
'PubChemFP665': ('[#7]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
520 |
+
'PubChemFP666': ('[#6]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
521 |
+
'PubChemFP667': ('[#6]=,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
522 |
+
'PubChemFP668': ('[#6]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
523 |
+
'PubChemFP669': ('[Cl]-,:[#6]:[#6]-,:[#6]=,:[#8]', 0),
|
524 |
+
'PubChemFP670': ('[Br]-,:[#6]:c:c-,:[#6]', 0),
|
525 |
+
'PubChemFP671': ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
526 |
+
'PubChemFP672': ('[#8]=,:[#6]-,:[#6]=,:[#6&!H0]', 0),
|
527 |
+
'PubChemFP673': ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#7]', 0),
|
528 |
+
'PubChemFP674': ('[#7]-,:[#6]-,:[#7]-,:[#6]:c', 0),
|
529 |
+
'PubChemFP675': ('[Br]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
530 |
+
'PubChemFP676': ('[#7]#[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
531 |
+
'PubChemFP677': ('[#6]-,:[#6]=,:[#6]-,:[#6]:c', 0),
|
532 |
+
'PubChemFP678': ('[#6]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
533 |
+
'PubChemFP679': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
534 |
+
'PubChemFP680': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
535 |
+
'PubChemFP681': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
536 |
+
'PubChemFP682': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
537 |
+
'PubChemFP683': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
538 |
+
'PubChemFP684': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
539 |
+
'PubChemFP685': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
540 |
+
'PubChemFP686': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
541 |
+
'PubChemFP687': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
542 |
+
'PubChemFP688': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
543 |
+
'PubChemFP689': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
544 |
+
'PubChemFP690': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
545 |
+
'PubChemFP691': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
546 |
+
'PubChemFP692': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
547 |
+
'PubChemFP693': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
548 |
+
'PubChemFP694': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
549 |
+
'PubChemFP695': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
550 |
+
'PubChemFP696': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
551 |
+
'PubChemFP697': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
|
552 |
+
'PubChemFP698': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
553 |
+
'PubChemFP699': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
|
554 |
+
'PubChemFP700': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
555 |
+
'PubChemFP701': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
|
556 |
+
'PubChemFP702': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
557 |
+
'PubChemFP703': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
|
558 |
+
'PubChemFP704': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
559 |
+
'PubChemFP705': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
|
560 |
+
'PubChemFP706': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](=,:[#8])-,:[#6]', 0),
|
561 |
+
'PubChemFP707': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
|
562 |
+
'PubChemFP708': ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
|
563 |
+
'PubChemFP709': ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]-,:[#6]', 0),
|
564 |
+
'PubChemFP710': ('[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
|
565 |
+
'PubChemFP711': ('[#6]-,:[#6](-,:[#6])(-,:[#6])-,:[#6]-,:[#6]', 0),
|
566 |
+
'PubChemFP712': ('[#6]-,:[#6](-,:[#6])-,:[#6](-,:[#6])-,:[#6]', 0),
|
567 |
+
'PubChemFP713': ('[#6]c1ccc([#6])cc1', 0),
|
568 |
+
'PubChemFP714': ('[#6]c1ccc([#8])cc1', 0),
|
569 |
+
'PubChemFP715': ('[#6]c1ccc([#16])cc1', 0),
|
570 |
+
'PubChemFP716': ('[#6]c1ccc([#7])cc1', 0),
|
571 |
+
'PubChemFP717': ('[#6]c1ccc(Cl)cc1', 0),
|
572 |
+
'PubChemFP718': ('[#6]c1ccc(Br)cc1', 0),
|
573 |
+
'PubChemFP719': ('[#8]c1ccc([#8])cc1', 0),
|
574 |
+
'PubChemFP720': ('[#8]c1ccc([#16])cc1', 0),
|
575 |
+
'PubChemFP721': ('[#8]c1ccc([#7])cc1', 0),
|
576 |
+
'PubChemFP722': ('[#8]c1ccc(Cl)cc1', 0),
|
577 |
+
'PubChemFP723': ('[#8]c1ccc(Br)cc1', 0),
|
578 |
+
'PubChemFP724': ('[#16]c1ccc([#16])cc1', 0),
|
579 |
+
'PubChemFP725': ('[#16]c1ccc([#7])cc1', 0),
|
580 |
+
'PubChemFP726': ('[#16]c1ccc(Cl)cc1', 0),
|
581 |
+
'PubChemFP727': ('[#16]c1ccc(Br)cc1', 0),
|
582 |
+
'PubChemFP728': ('[#7]c1ccc([#7])cc1', 0),
|
583 |
+
'PubChemFP729': ('[#7]c1ccc(Cl)cc1', 0),
|
584 |
+
'PubChemFP730': ('[#7]c1ccc(Br)cc1', 0),
|
585 |
+
'PubChemFP731': ('Clc1ccc(Cl)cc1', 0),
|
586 |
+
'PubChemFP732': ('Clc1ccc(Br)cc1', 0),
|
587 |
+
'PubChemFP733': ('Brc1ccc(Br)cc1', 0),
|
588 |
+
'PubChemFP734': ('[#6]c1cc([#6])ccc1', 0),
|
589 |
+
'PubChemFP735': ('[#6]c1cc([#8])ccc1', 0),
|
590 |
+
'PubChemFP736': ('[#6]c1cc([#16])ccc1', 0),
|
591 |
+
'PubChemFP737': ('[#6]c1cc([#7])ccc1', 0),
|
592 |
+
'PubChemFP738': ('[#6]c1cc(Cl)ccc1', 0),
|
593 |
+
'PubChemFP739': ('[#6]c1cc(Br)ccc1', 0),
|
594 |
+
'PubChemFP740': ('[#8]c1cc([#8])ccc1', 0),
|
595 |
+
'PubChemFP741': ('[#8]c1cc([#16])ccc1', 0),
|
596 |
+
'PubChemFP742': ('[#8]c1cc([#7])ccc1', 0),
|
597 |
+
'PubChemFP743': ('[#8]c1cc(Cl)ccc1', 0),
|
598 |
+
'PubChemFP744': ('[#8]c1cc(Br)ccc1', 0),
|
599 |
+
'PubChemFP745': ('[#16]c1cc([#16])ccc1', 0),
|
600 |
+
'PubChemFP746': ('[#16]c1cc([#7])ccc1', 0),
|
601 |
+
'PubChemFP747': ('[#16]c1cc(Cl)ccc1', 0),
|
602 |
+
'PubChemFP748': ('[#16]c1cc(Br)ccc1', 0),
|
603 |
+
'PubChemFP749': ('[#7]c1cc([#7])ccc1', 0),
|
604 |
+
'PubChemFP750': ('[#7]c1cc(Cl)ccc1', 0),
|
605 |
+
'PubChemFP751': ('[#7]c1cc(Br)ccc1', 0),
|
606 |
+
'PubChemFP752': ('Clc1cc(Cl)ccc1', 0),
|
607 |
+
'PubChemFP753': ('Clc1cc(Br)ccc1', 0),
|
608 |
+
'PubChemFP754': ('Brc1cc(Br)ccc1', 0),
|
609 |
+
'PubChemFP755': ('[#6]c1c([#6])cccc1', 0),
|
610 |
+
'PubChemFP756': ('[#6]c1c([#8])cccc1', 0),
|
611 |
+
'PubChemFP757': ('[#6]c1c([#16])cccc1', 0),
|
612 |
+
'PubChemFP758': ('[#6]c1c([#7])cccc1', 0),
|
613 |
+
'PubChemFP759': ('[#6]c1c(Cl)cccc1', 0),
|
614 |
+
'PubChemFP760': ('[#6]c1c(Br)cccc1', 0),
|
615 |
+
'PubChemFP761': ('[#8]c1c([#8])cccc1', 0),
|
616 |
+
'PubChemFP762': ('[#8]c1c([#16])cccc1', 0),
|
617 |
+
'PubChemFP763': ('[#8]c1c([#7])cccc1', 0),
|
618 |
+
'PubChemFP764': ('[#8]c1c(Cl)cccc1', 0),
|
619 |
+
'PubChemFP765': ('[#8]c1c(Br)cccc1', 0),
|
620 |
+
'PubChemFP766': ('[#16]c1c([#16])cccc1', 0),
|
621 |
+
'PubChemFP767': ('[#16]c1c([#7])cccc1', 0),
|
622 |
+
'PubChemFP768': ('[#16]c1c(Cl)cccc1', 0),
|
623 |
+
'PubChemFP769': ('[#16]c1c(Br)cccc1', 0),
|
624 |
+
'PubChemFP770': ('[#7]c1c([#7])cccc1', 0),
|
625 |
+
'PubChemFP771': ('[#7]c1c(Cl)cccc1', 0),
|
626 |
+
'PubChemFP772': ('[#7]c1c(Br)cccc1', 0),
|
627 |
+
'PubChemFP773': ('Clc1c(Cl)cccc1', 0),
|
628 |
+
'PubChemFP774': ('Clc1c(Br)cccc1', 0),
|
629 |
+
'PubChemFP775': ('Brc1c(Br)cccc1', 0),
|
630 |
+
'PubChemFP776': ('[#6][#6]1[#6][#6][#6]([#6])[#6][#6]1', 0),
|
631 |
+
'PubChemFP777': ('[#6][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
|
632 |
+
'PubChemFP778': ('[#6][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
633 |
+
'PubChemFP779': ('[#6][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
634 |
+
'PubChemFP780': ('[#6][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
635 |
+
'PubChemFP781': ('[#6][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
636 |
+
'PubChemFP782': ('[#8][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
|
637 |
+
'PubChemFP783': ('[#8][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
638 |
+
'PubChemFP784': ('[#8][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
639 |
+
'PubChemFP785': ('[#8][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
640 |
+
'PubChemFP786': ('[#8][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
641 |
+
'PubChemFP787': ('[#16][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
642 |
+
'PubChemFP788': ('[#16][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
643 |
+
'PubChemFP789': ('[#16][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
644 |
+
'PubChemFP790': ('[#16][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
645 |
+
'PubChemFP791': ('[#7][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
646 |
+
'PubChemFP792': ('[#7][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
647 |
+
'PubChemFP793': ('[#7][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
648 |
+
'PubChemFP794': ('Cl[#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
649 |
+
'PubChemFP795': ('Cl[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
650 |
+
'PubChemFP796': ('Br[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
651 |
+
'PubChemFP797': ('[#6][#6]1[#6][#6]([#6])[#6][#6][#6]1', 0),
|
652 |
+
'PubChemFP798': ('[#6][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
|
653 |
+
'PubChemFP799': ('[#6][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
654 |
+
'PubChemFP800': ('[#6][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
655 |
+
'PubChemFP801': ('[#6][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
656 |
+
'PubChemFP802': ('[#6][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
657 |
+
'PubChemFP803': ('[#8][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
|
658 |
+
'PubChemFP804': ('[#8][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
659 |
+
'PubChemFP805': ('[#8][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
660 |
+
'PubChemFP806': ('[#8][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
661 |
+
'PubChemFP807': ('[#8][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
662 |
+
'PubChemFP808': ('[#16][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
663 |
+
'PubChemFP809': ('[#16][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
664 |
+
'PubChemFP810': ('[#16][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
665 |
+
'PubChemFP811': ('[#16][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
666 |
+
'PubChemFP812': ('[#7][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
667 |
+
'PubChemFP813': ('[#7][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
668 |
+
'PubChemFP814': ('[#7][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
669 |
+
'PubChemFP815': ('Cl[#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
670 |
+
'PubChemFP816': ('Cl[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
671 |
+
'PubChemFP817': ('Br[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
672 |
+
'PubChemFP818': ('[#6][#6]1[#6]([#6])[#6][#6][#6][#6]1', 0),
|
673 |
+
'PubChemFP819': ('[#6][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
|
674 |
+
'PubChemFP820': ('[#6][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
675 |
+
'PubChemFP821': ('[#6][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
676 |
+
'PubChemFP822': ('[#6][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
677 |
+
'PubChemFP823': ('[#6][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
678 |
+
'PubChemFP824': ('[#8][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
|
679 |
+
'PubChemFP825': ('[#8][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
680 |
+
'PubChemFP826': ('[#8][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
681 |
+
'PubChemFP827': ('[#8][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
682 |
+
'PubChemFP828': ('[#8][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
683 |
+
'PubChemFP829': ('[#16][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
684 |
+
'PubChemFP830': ('[#16][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
685 |
+
'PubChemFP831': ('[#16][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
686 |
+
'PubChemFP832': ('[#16][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
687 |
+
'PubChemFP833': ('[#7][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
688 |
+
'PubChemFP834': ('[#7][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
689 |
+
'PubChemFP835': ('[#7][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
690 |
+
'PubChemFP836': ('Cl[#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
691 |
+
'PubChemFP837': ('Cl[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
692 |
+
'PubChemFP838': ('Br[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
693 |
+
'PubChemFP839': ('[#6][#6]1[#6][#6]([#6])[#6][#6]1', 0),
|
694 |
+
'PubChemFP840': ('[#6][#6]1[#6][#6]([#8])[#6][#6]1', 0),
|
695 |
+
'PubChemFP841': ('[#6][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
696 |
+
'PubChemFP842': ('[#6][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
697 |
+
'PubChemFP843': ('[#6][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
698 |
+
'PubChemFP844': ('[#6][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
699 |
+
'PubChemFP845': ('[#8][#6]1[#6][#6]([#8])[#6][#6]1', 0),
|
700 |
+
'PubChemFP846': ('[#8][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
701 |
+
'PubChemFP847': ('[#8][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
702 |
+
'PubChemFP848': ('[#8][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
703 |
+
'PubChemFP849': ('[#8][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
704 |
+
'PubChemFP850': ('[#16][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
705 |
+
'PubChemFP851': ('[#16][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
706 |
+
'PubChemFP852': ('[#16][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
707 |
+
'PubChemFP853': ('[#16][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
708 |
+
'PubChemFP854': ('[#7][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
709 |
+
'PubChemFP855': ('[#7][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
710 |
+
'PubChemFP856': ('[#7][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
711 |
+
'PubChemFP857': ('Cl[#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
712 |
+
'PubChemFP858': ('Cl[#6]1[#6][#6](Br)[#6][#6]1', 0),
|
713 |
+
'PubChemFP859': ('Br[#6]1[#6][#6](Br)[#6][#6]1', 0),
|
714 |
+
'PubChemFP860': ('[#6][#6]1[#6]([#6])[#6][#6][#6]1', 0),
|
715 |
+
'PubChemFP861': ('[#6][#6]1[#6]([#8])[#6][#6][#6]1', 0),
|
716 |
+
'PubChemFP862': ('[#6][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
717 |
+
'PubChemFP863': ('[#6][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
718 |
+
'PubChemFP864': ('[#6][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
719 |
+
'PubChemFP865': ('[#6][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
720 |
+
'PubChemFP866': ('[#8][#6]1[#6]([#8])[#6][#6][#6]1', 0),
|
721 |
+
'PubChemFP867': ('[#8][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
722 |
+
'PubChemFP868': ('[#8][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
723 |
+
'PubChemFP869': ('[#8][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
724 |
+
'PubChemFP870': ('[#8][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
725 |
+
'PubChemFP871': ('[#16][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
726 |
+
'PubChemFP872': ('[#16][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
727 |
+
'PubChemFP873': ('[#16][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
728 |
+
'PubChemFP874': ('[#16][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
729 |
+
'PubChemFP875': ('[#7][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
730 |
+
'PubChemFP876': ('[#7][#6]1[#6](Cl)[#6][#6]1', 0),
|
731 |
+
'PubChemFP877': ('[#7][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
732 |
+
'PubChemFP878': ('Cl[#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
733 |
+
'PubChemFP879': ('Cl[#6]1[#6](Br)[#6][#6][#6]1', 0),
|
734 |
+
'PubChemFP880': ('Br[#6]1[#6](Br)[#6][#6][#6]1', 0)}
|
deepscreen/data/featurizers/fingerprint/torsions.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from rdkit.Chem.AtomPairs import Torsions
|
2 |
+
from rdkit.Chem import DataStructs
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
_type = 'topological-based'
|
6 |
+
|
7 |
+
|
8 |
+
def GetTorsionFPs(mol, nBits=2048, binary=True):
    '''
    Hashed topological-torsion fingerprint of ``mol`` as a NumPy array.

    :param mol: molecule passed to RDKit's
        ``Torsions.GetHashedTopologicalTorsionFingerprint``.
    :param nBits: fingerprint length, forwarded to RDKit as ``nBits``.
    :param binary: selects the dtype of the returned array -- ``np.bool_``
        when truthy, ``np.int8`` otherwise.
        NOTE(review): presumably the int8 form carries per-bit counts;
        confirm that counts cannot exceed the int8 range for your inputs.
    :return: the array populated by ``DataStructs.ConvertToNumpyArray``.
    '''
    hashed_fp = Torsions.GetHashedTopologicalTorsionFingerprint(mol, nBits=nBits)
    out_dtype = np.bool_ if binary else np.int8
    # Start from an empty buffer: ConvertToNumpyArray is relied on to resize
    # it to the fingerprint length and fill it in place.
    vec = np.zeros((0,), dtype=out_dtype)
    DataStructs.ConvertToNumpyArray(hashed_fp, vec)
    return vec
|
deepscreen/data/featurizers/graph.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import networkx as nx
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
from rdkit import Chem
|
5 |
+
from torch_geometric.utils import from_smiles
|
6 |
+
from torch_geometric.data import Data
|
7 |
+
|
8 |
+
from deepscreen.data.featurizers.categorical import one_of_k_encoding_unk, one_of_k_encoding
|
9 |
+
from deepscreen.utils import get_logger
|
10 |
+
|
11 |
+
log = get_logger(__name__)
|
12 |
+
|
13 |
+
|
14 |
+
def atom_features(atom, explicit_H=False, use_chirality=True):
    """
    Encode an RDKit atom as a flat feature vector (adapted from TransformerCPI 2.0).

    Layout, in order:
        * element symbol encoding over 10 categories
        * degree encoding over 7 categories
        * formal charge and number of radical electrons (2 raw values)
        * hybridization encoding over 6 categories
        * aromaticity flag (1 value)
        * total hydrogen count encoding over 5 categories, unless `explicit_H`
        * [CIP code is 'R', CIP code is 'S', chirality possible], if `use_chirality`

    Parameters:
        atom: RDKit atom to describe.
        explicit_H (bool), default False: skip the hydrogen-count part
            (for datasets with explicit hydrogens such as QM8/QM9).
        use_chirality (bool), default True: append the three chirality entries.
    Returns:
        1-D numpy array with the concatenated features.
    """
    symbol = ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I', 'other']  # 10-dim
    degree = [0, 1, 2, 3, 4, 5, 6]  # 7-dim
    hybridization_type = [Chem.rdchem.HybridizationType.SP,
                          Chem.rdchem.HybridizationType.SP2,
                          Chem.rdchem.HybridizationType.SP3,
                          Chem.rdchem.HybridizationType.SP3D,
                          Chem.rdchem.HybridizationType.SP3D2,
                          'other']  # 6-dim

    # 10+7+2+6+1=26
    results = one_of_k_encoding_unk(atom.GetSymbol(), symbol) + \
              one_of_k_encoding(atom.GetDegree(), degree) + \
              [atom.GetFormalCharge(), atom.GetNumRadicalElectrons()] + \
              one_of_k_encoding_unk(atom.GetHybridization(), hybridization_type) + [atom.GetIsAromatic()]

    # In case of explicit hydrogen(QM8, QM9), avoid calling `GetTotalNumHs`
    # 26+5=31
    if not explicit_H:
        results = results + one_of_k_encoding_unk(atom.GetTotalNumHs(),
                                                  [0, 1, 2, 3, 4])
    # 31+3=34
    if use_chirality:
        chirality_possible = [atom.HasProp('_ChiralityPossible')]
        try:
            cip_code = atom.GetProp('_CIPCode')
        except KeyError:
            # Atoms without an assigned CIP code are neither R nor S. Only the
            # missing-property error is handled; anything else propagates.
            results = results + [False, False] + chirality_possible
        else:
            results = results + one_of_k_encoding_unk(cip_code, ['R', 'S']) + chirality_possible

    return np.array(results)
|
48 |
+
|
49 |
+
|
50 |
+
def bond_features(bond):
    """
    Encode an RDKit bond as six flags:
    [single, double, triple, aromatic, conjugated, in ring].
    """
    bond_type = bond.GetBondType()
    bond_kinds = (
        Chem.rdchem.BondType.SINGLE,
        Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE,
        Chem.rdchem.BondType.AROMATIC,
    )
    flags = [bond_type == kind for kind in bond_kinds]
    flags.append(bond.GetIsConjugated())
    flags.append(bond.IsInRing())
    return np.array(flags)
|
55 |
+
|
56 |
+
|
57 |
+
def smiles_to_graph_pyg(smiles):
    """
    Featurize a SMILES string with PyTorch Geometric's built-in `from_smiles`.

    Any failure is logged as a warning and reported to the caller as `None`.
    """
    try:
        graph = from_smiles(smiles)
    except Exception as e:
        log.warning(f"Failed to featurize the following SMILES to graph: {smiles} due to {str(e)}")
        return None
    return graph
|
66 |
+
|
67 |
+
|
68 |
+
def smiles_to_graph(smiles, atom_features: callable = atom_features):
    """
    Build a PyTorch Geometric `Data` graph from a SMILES string using a custom
    per-atom featurizer.

    Node features are the `atom_features` vectors, each divided by the sum of
    its entries. Every bond yields two directed edges; a molecule without
    bonds receives the single placeholder edge (0, 0). Any failure is logged
    as a warning and `None` is returned.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)

        # One sum-normalized feature row per atom.
        raw_rows = (atom_features(atom) for atom in mol.GetAtoms())
        features = np.array([row / sum(row) for row in raw_rows])

        bond_pairs = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
        directed = nx.Graph(bond_pairs).to_directed()

        if bond_pairs:
            edge_index = [[src, dst] for src, dst in directed.edges]
        else:
            edge_index = [[0, 0]]

        return Data(x=torch.Tensor(features),
                    edge_index=torch.LongTensor(edge_index).transpose(0, 1))

    except Exception as e:
        log.warning(f"Failed to convert SMILES ({smiles}) to graph due to {str(e)}")
        return None
|
99 |
+
# features = []
|
100 |
+
# for atom in mol.GetAtoms():
|
101 |
+
# feature = atom_features(atom)
|
102 |
+
# features.append(feature / sum(feature))
|
103 |
+
#
|
104 |
+
# edge_indices = []
|
105 |
+
# for bond in mol.GetBonds():
|
106 |
+
# i = bond.GetBeginAtomIdx()
|
107 |
+
# j = bond.GetEndAtomIdx()
|
108 |
+
# edge_indices += [[i, j], [j, i]]
|
109 |
+
#
|
110 |
+
# edge_index = torch.tensor(edge_indices)
|
111 |
+
# edge_index = edge_index.t().to(torch.long).view(2, -1)
|
112 |
+
#
|
113 |
+
# if edge_index.numel() > 0: # Sort indices.
|
114 |
+
# perm = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
|
115 |
+
# edge_index = edge_index[:, perm]
|
116 |
+
#
|
117 |
+
|
118 |
+
|
119 |
+
def smiles_to_mol_features(smiles, num_atom_feat: callable):
    """
    Return `(atom_feat, adj_mat)` for a SMILES string: one `atom_features` row
    per atom and the molecule's adjacency matrix as a numpy array.

    NOTE(review): the `num_atom_feat` argument is never read; the feature width
    is always measured from the first atom. Confirm whether callers rely on it.

    Any failure is logged as a warning and `None` is returned.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        # Feature width is taken from the first atom's vector.
        num_atom_feat = len(atom_features(mol.GetAtoms()[0]))
        atom_feat = np.zeros((mol.GetNumAtoms(), num_atom_feat))
        for atom in mol.GetAtoms():
            row = atom.GetIdx()
            atom_feat[row, :] = atom_features(atom)
        adj_mat = np.array(Chem.GetAdjacencyMatrix(mol))

        return atom_feat, adj_mat

    except Exception as e:
        log.warning(f"Failed to featurize the following SMILES to molecular features: {smiles} due to {str(e)}")
        return None
|
deepscreen/data/featurizers/monn.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from rdkit.Chem import MolFromSmiles
|
3 |
+
|
4 |
+
from deepscreen.data.featurizers.categorical import FASTA_VOCAB, fasta_to_label
|
5 |
+
from deepscreen.data.featurizers.graph import atom_features, bond_features
|
6 |
+
|
7 |
+
|
8 |
+
def get_mask(arr):
    """
    Return a (1, len(arr)) float mask filled with ones, one entry per element of `arr`.

    The shape must be passed to `np.zeros` as a tuple (a second positional
    argument is interpreted as the dtype), and the only row of the mask is
    row 0.
    """
    length = len(arr)
    a = np.zeros((1, length))
    a[0, :length] = 1
    return a
|
12 |
+
|
13 |
+
|
14 |
+
def add_index(input_array, ebd_size):
    """
    Flatten a (batch, n_vertex, n_nbs) index array and shift every entry of
    batch element `b` by `b * ebd_size`.

    Parameters:
        input_array: 3-D numpy array of indices.
        ebd_size (int): offset between consecutive batch elements.
    Returns:
        1-D numpy array of length batch * n_vertex * n_nbs.
    """
    batch_size, n_vertex, n_nbs = np.shape(input_array)
    # One offset per batch element, repeated for every (vertex, neighbor) slot.
    # (Multiplying a `range` by an int is not supported on Python 3.)
    add_idx = np.repeat(np.arange(batch_size) * ebd_size, n_vertex * n_nbs)
    new_array = input_array.reshape(-1) + add_idx
    return new_array
|
21 |
+
|
22 |
+
|
23 |
+
# TODO fix padding and masking
|
24 |
+
def drug_featurizer(smiles, max_neighbors=6):
    """
    Build per-atom neighbor tables for the molecule described by `smiles`.

    Work in progress: padding and masking are not finished (see the review
    notes in the body).

    Parameters:
        smiles (string): SMILES string parsed with RDKit's `MolFromSmiles`.
        max_neighbors (int), default 6: number of neighbor slots per atom.
    Returns:
        `(vertex_mask, feat_atoms, feat_bonds, atom_adj, bond_adj, neighbor_mask)`,
        or five empty lists when an atom has more than `max_neighbors` bonded neighbors.
    """
    mol = MolFromSmiles(smiles)

    # convert molecule to GNN input
    n_atoms = mol.GetNumAtoms()
    assert mol.GetNumBonds() >= 0

    n_bonds = max(mol.GetNumBonds(), 1)
    feat_atoms = np.zeros((n_atoms,))  # atom feature ID
    feat_bonds = np.zeros((n_bonds,))  # bond feature ID
    atom_adj = np.zeros((n_atoms, max_neighbors))
    bond_adj = np.zeros((n_atoms, max_neighbors))
    # Integer counters: they are used below as column indices and slice bounds,
    # which NumPy only accepts as integers.
    n_neighbors = np.zeros((n_atoms,), dtype=np.int64)
    neighbor_mask = np.zeros((n_atoms, max_neighbors))

    for atom in mol.GetAtoms():
        idx = atom.GetIdx()
        # NOTE(review): `feat_atoms` holds one scalar per atom; confirm that
        # `atom_features` yields a scalar ID here rather than a feature vector.
        feat_atoms[idx] = atom_features(atom)

    for bond in mol.GetBonds():
        a1 = bond.GetBeginAtom().GetIdx()
        a2 = bond.GetEndAtom().GetIdx()
        idx = bond.GetIdx()
        # NOTE(review): same concern as above for `bond_features` and `feat_bonds`.
        feat_bonds[idx] = bond_features(bond)
        try:
            atom_adj[a1, n_neighbors[a1]] = a2
            atom_adj[a2, n_neighbors[a2]] = a1
        except IndexError:
            # An atom has more than `max_neighbors` neighbors.
            # NOTE(review): five values here versus six on the normal path —
            # confirm which arity callers expect.
            return [], [], [], [], []
        bond_adj[a1, n_neighbors[a1]] = idx
        bond_adj[a2, n_neighbors[a2]] = idx
        n_neighbors[a1] += 1
        n_neighbors[a2] += 1

    for i, count in enumerate(n_neighbors):
        neighbor_mask[i, :count] = 1

    vertex_mask = get_mask(feat_atoms)
    # vertex = pack_1d(feat_atoms)
    # edge = pack_1d(feat_bonds)
    # atom_adj = pack_2d(atom_adj)
    # bond_adj = pack_2d(bond_adj)
    # nbs_mask = pack_2d(n_neighbors_mat)

    # NOTE(review): `atom_adj` / `bond_adj` are 2-D and `feat_bonds` is 1-D here;
    # confirm that `add_index` and `np.shape(feat_bonds)[1]` accept these shapes
    # (they look like leftovers from batch-level code).
    atom_adj = add_index(atom_adj, np.shape(atom_adj)[1])
    bond_adj = add_index(bond_adj, np.shape(feat_bonds)[1])

    return vertex_mask, feat_atoms, feat_bonds, atom_adj, bond_adj, neighbor_mask
|
72 |
+
|
73 |
+
|
74 |
+
# TODO WIP the pairwise_label matrix probably should be generated beforehand and stored as an extra label in the dataset
|
75 |
+
def get_pairwise_label(pdbid, interaction_dict, mol):
    """
    Build the atom-by-residue interaction matrix for one complex.

    Parameters:
        pdbid: key of the complex in `interaction_dict`.
        interaction_dict (dict): per-complex records; the keys read here are
            'atom_element', 'atom_name', 'uniprot_seq', 'atom_bond_type'
            and 'residue_bond_type'.
        mol: molecule whose atoms must match the non-hydrogen entries of
            'atom_element' (compared by upper-cased symbol).
    Returns:
        `(True, pairwise_mat)`, where `pairwise_mat` has shape
        (non-H atoms, len(uniprot_seq)) and holds 1 wherever an atom and a
        residue share a bond type, if at least one entry is set;
        otherwise `(False, np.zeros((1, 1)))`.
    """
    if pdbid in interaction_dict:
        record = interaction_dict[pdbid]
        sdf_element = np.array([atom.GetSymbol().upper() for atom in mol.GetAtoms()])
        atom_element = np.array(record['atom_element'], dtype=str)
        atom_name_list = np.array(record['atom_name'], dtype=str)
        nonH_position = np.where(atom_element != 'H')[0]
        assert sum(atom_element[nonH_position] != sdf_element) == 0

        atom_name_list = atom_name_list[nonH_position].tolist()
        pairwise_mat = np.zeros((len(nonH_position), len(record['uniprot_seq'])), dtype=np.int32)
        for atom_name, bond_type in record['atom_bond_type']:
            atom_idx = atom_name_list.index(str(atom_name))
            assert atom_idx < len(nonH_position)

            # Mark every residue that takes part in the same bond type as this atom.
            for seq_idx, bond_type_seq in record['residue_bond_type']:
                if bond_type == bond_type_seq:
                    pairwise_mat[atom_idx, seq_idx] = 1
        if pairwise_mat.any():
            return True, pairwise_mat
    return False, np.zeros((1, 1))
|
99 |
+
|
100 |
+
|
101 |
+
def protein_featurizer(fasta):
    """
    Convert a FASTA sequence to labels with `fasta_to_label` and return
    `(seq_mask, sequence)`, where the mask comes from `get_mask`.
    """
    labels = fasta_to_label(fasta)
    # pad proteins and make masks
    return get_mask(labels), labels
|
deepscreen/data/featurizers/token.py
ADDED
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import collections
|
2 |
+
from importlib import resources
|
3 |
+
import os
|
4 |
+
import re
|
5 |
+
from typing import Optional, List
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
from transformers import BertTokenizer
|
9 |
+
|
10 |
+
SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
|
11 |
+
# \[[^\]]+\] # match anything inside square brackets
|
12 |
+
# |Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p # match elements
|
13 |
+
# |\(|\) # match parentheses
|
14 |
+
# |\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2} # match various symbols
|
15 |
+
# |[0-9] # match digits
|
16 |
+
|
17 |
+
|
18 |
+
def sequence_to_kmers(sequence, k=3):
    """ Split a string into its overlapping substrings of length k.

    Parameters:
        sequence (string)
        k (int), default 3
    Returns:
        List of k-mer strings, in order of their start position
        (empty when the sequence is shorter than k).
    """
    kmers = []
    last_start = len(sequence) - k
    for start in range(last_start + 1):
        kmers.append(sequence[start:start + k])
    return kmers
|
28 |
+
|
29 |
+
|
30 |
+
def sequence_to_word_embedding(sequence, model):
    """
    Embed a protein sequence as a (num_3mers, 100) matrix.

    Each 3-mer is looked up in `model.wv`; 3-mers that raise `KeyError` keep
    an all-zero row.
    """
    words = sequence_to_kmers(sequence)
    vec = np.zeros((len(words), 100))
    for row, word in enumerate(words):
        try:
            vec[row,] = model.wv[word]
        except KeyError:
            # Unknown 3-mer: leave the zero row in place.
            continue
    return vec
|
42 |
+
|
43 |
+
|
44 |
+
def sequence_to_token_ids(sequence, tokenizer):
    """Encode `sequence` with `tokenizer.encode` and return the ids as a numpy array."""
    return np.array(tokenizer.encode(sequence))
|
47 |
+
|
48 |
+
|
49 |
+
# def sequence_to_token_ids(sequence, tokenizer, max_length: int):
|
50 |
+
# token_ids = tokenizer.encode(sequence)
|
51 |
+
# length = min(max_length, len(token_ids))
|
52 |
+
#
|
53 |
+
# token_ids_padded = np.zeros(max_length, dtype='int')
|
54 |
+
# token_ids_padded[:length] = token_ids[:length]
|
55 |
+
#
|
56 |
+
# return token_ids_padded
|
57 |
+
|
58 |
+
|
59 |
+
class SmilesTokenizer(BertTokenizer):
    """
    SMILES tokenizer built on Huggingface's `BertTokenizer`.
    Adapted from https://github.com/deepchem/deepchem/.

    SMILES strings are split with the regex developed by Schwaller et al. and
    the resulting tokens are mapped to ids through a one-token-per-line vocabulary file.

    Please see https://github.com/huggingface/transformers
    and https://github.com/rxn4chemistry/rxnfp for more details.

    Examples
    --------
    >>> tokenizer = SmilesTokenizer(vocab_path, regex_pattern)
    >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
    [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]


    References
    ----------
    .. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
        Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
        Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Note
    ----
    This class requires huggingface's transformers and tokenizers libraries to be installed.
    """

    def __init__(
            self,
            vocab_file: str = 'resources/vocabs/smiles.txt',
            regex_pattern: str = SMI_REGEX_PATTERN,
            **kwargs):
        """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a vocabulary file with one SMILES token per line.
        regex_pattern: str
            Regex used to split a SMILES string into tokens.
        **kwargs
            Forwarded unchanged to `BertTokenizer.__init__`.
        """

        super().__init__(vocab_file, **kwargs)

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocab file at path '{}'.".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        # Positions (in file order) of the "[unused..." placeholder entries.
        unused_positions = [
            position for position, entry in enumerate(self.vocab.keys())
            if entry.startswith("[unused")
        ]
        self.highest_unused_index = max(unused_positions, default=0)
        self.ids_to_tokens = collections.OrderedDict(
            (token_id, token) for token, token_id in self.vocab.items())
        self.basic_tokenizer = BasicSmilesTokenizer(regex_pattern=regex_pattern)

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        return len(self.vocab)

    @property
    def vocab_list(self):
        """Vocabulary tokens as a list, in vocabulary order."""
        return list(self.vocab.keys())

    def _tokenize(self, text: str, max_seq_length: int = 512, **kwargs):
        """Split a SMILES string into tokens, truncated to `max_seq_length - 2` tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        max_seq_length: int
            Total length budget; two positions are held back
            (presumably for [CLS] and [SEP]).
        """

        token_budget = max_seq_length - 2
        all_tokens = self.basic_tokenizer.tokenize(text)
        return list(all_tokens[:token_budget])

    def _convert_token_to_id(self, token: str):
        """Look up the id of a token, falling back to the id of the unknown token.

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """

        unknown_id = self.vocab.get(self.unk_token)
        return self.vocab.get(token, unknown_id)

    def _convert_id_to_token(self, index: int):
        """Look up the token of an id, falling back to the unknown token.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part of a larger sequence.
        """

        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]):
        """Join tokens into one space-separated string, merging " ##" continuations.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """

        joined = " ".join(tokens)
        merged = joined.replace(" ##", "")
        out_string: str = merged.strip()
        return out_string

    def add_special_tokens_ids_single_sequence(self,
                                               token_ids: List[Optional[int]]):
        """Wrap a list of ids as [CLS] X [SEP].

        Parameters
        ----------
        token_ids: list[int]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        """

        head = [self.cls_token_id]
        tail = [self.sep_token_id]
        return head + token_ids + tail

    def add_special_tokens_single_sequence(self, tokens: List[str]):
        """Wrap a list of tokens as [CLS] X [SEP].

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.
        """
        head = [self.cls_token]
        tail = [self.sep_token]
        return head + tokens + tail

    def add_special_tokens_ids_sequence_pair(
            self, token_ids_0: List[Optional[int]],
            token_ids_1: List[Optional[int]]) -> List[Optional[int]]:
        """Combine two id lists as [CLS] A [SEP] B [SEP].

        Parameters
        ----------
        token_ids_0: List[int]
            List of ids for the first string sequence in the sequence pair (A).
        token_ids_1: List[int]
            List of ids for the second string sequence in the sequence pair (B).
        """

        separator = [self.sep_token_id]
        start = [self.cls_token_id]

        return start + token_ids_0 + separator + token_ids_1 + separator

    def add_padding_tokens(self,
                           token_ids: List[Optional[int]],
                           length: int,
                           right: bool = True) -> List[Optional[int]]:
        """Pad a list of ids with the padding id up to `length` entries.
        By default padding tokens are added to the right of the sequence.

        Parameters
        ----------
        token_ids: list[optional[int]]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        length: int
            Target length; inputs already at least this long are returned without padding.
        right: bool, default True
            Pad on the right when true, on the left otherwise.

        Returns
        -------
        List[int]
        """
        missing = length - len(token_ids)
        padding = [self.pad_token_id] * missing

        return token_ids + padding if right else padding + token_ids
|
251 |
+
|
252 |
+
|
253 |
+
class BasicSmilesTokenizer(object):
    """
    Regex-only SMILES tokenizer, adapted from https://github.com/deepchem/deepchem/.
    Splits SMILES strings with the pattern developed by Schwaller et. al.
    Use it when a tokenizer without a dependency on HuggingFace's transformers library is required.

    Examples
    --------
    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
    ['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']


    References
    ----------
    .. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
        ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
        1572-1583 DOI: 10.1021/acscentsci.9b00576
    """

    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
        """Constructs a BasicSMILESTokenizer.

        Parameters
        ----------
        regex_pattern: string
            SMILES token regex; kept as `regex_pattern` and compiled into `regex`.
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(regex_pattern)

    def tokenize(self, text):
        """Return every regex match found in `text`, in order, as a list of tokens.
        """
        return list(self.regex.findall(text))
|
289 |
+
|
290 |
+
|
291 |
+
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered token -> line index mapping."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for line_number, line in enumerate(reader):
            # Only the trailing newline is stripped; other whitespace belongs to the token.
            vocab[line.rstrip("\n")] = line_number
    return vocab
|
deepscreen/data/single_entity.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from itertools import product
|
2 |
+
from numbers import Number
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
5 |
+
|
6 |
+
# import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
from lightning import LightningDataModule
|
9 |
+
from sklearn.base import TransformerMixin
|
10 |
+
from torch.utils.data import Dataset, DataLoader, random_split
|
11 |
+
|
12 |
+
from deepscreen.data.utils.dataset import SingleEntitySingleTargetDataset, BaseEntityDataset
|
13 |
+
from deepscreen.data.utils.label import label_transform
|
14 |
+
from deepscreen.data.utils.collator import collate_fn
|
15 |
+
from deepscreen.data.utils.sampler import SafeBatchSampler
|
16 |
+
|
17 |
+
|
18 |
+
class EntityDataModule(LightningDataModule):
    """
    Lightning DataModule for single-entity datasets.

    Data can be supplied in three ways:

        * training from one file: `train=True` with `data_file`, a `split` callable and
          `train_val_test_split` holding numbers (the split itself happens in `setup`);
        * training from pre-split files: `train=True` with `train_val_test_split` holding
          three file names, and both `data_file` and `split` unset;
        * testing/predicting: `train=False` with only `data_file`.

    The datasets are exposed as `data_train`, `data_val`, `data_test` and `data_predict`.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
            self,
            dataset: type[BaseEntityDataset],
            task: Literal['regression', 'binary', 'multiclass'],
            n_classes: Optional[int],
            train: bool,
            batch_size: int,
            num_workers: int = 0,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            pin_memory: bool = False,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            train_val_test_split: Optional[Union[Sequence[Number], Sequence[str]]] = None,
            split: Optional[callable] = random_split,
    ):
        """
        Validate the combination of arguments and, where files are named explicitly,
        load the datasets right away.

        Raises:
            ValueError: if the arguments do not match one of the three supported combinations.
        """
        super().__init__()
        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

        # data processing
        self.split = split

        # Always defined, so that `setup` and the dataloaders can rely on them.
        self.data_train = None
        self.data_val = None
        self.data_test = None
        self.data_predict = None

        def split_items_are(kind):
            # True when a split spec was given and every entry is of type `kind`.
            return train_val_test_split is not None and all(
                isinstance(item, kind) for item in train_val_test_split)

        if train:
            if all([data_file, split]):
                # The actual splitting is deferred to `setup`.
                if not split_items_are(Number):
                    raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
                                     '(float for percentages and int for sample numbers) if '
                                     '`data_file` and `split` have been specified.')
            elif split_items_are(str) and not any([data_file, split]):
                self.data_train = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[0]))
                self.data_val = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[1]))
                self.data_test = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[2]))
            else:
                raise ValueError('For training (train=True), you must specify either '
                                 '`data_file` and `split` with `train_val_test_split` of 3 numbers or '
                                 'solely `train_val_test_split` of 3 data file names.')
        else:
            if data_file and not any([split, train_val_test_split]):
                self.data_test = self.data_predict = dataset(dataset_path=str(Path(data_dir) / data_file))
            else:
                raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
                                 "`train_val_test_split` or `split`")

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.
        """
        # load and split datasets only if not loaded in initialization
        loaded = (self.data_train, self.data_val, self.data_test, self.data_predict)
        if all(data is None for data in loaded):
            dataset = SingleEntitySingleTargetDataset(
                task=self.hparams.task,
                n_classes=self.hparams.n_classes,
                dataset_path=Path(self.hparams.data_dir) / self.hparams.data_file,
                # NOTE(review): `transformer` and `featurizer` are not parameters of this
                # class's __init__, so unless a subclass adds them to the hyperparameters
                # these lookups fail — TODO confirm where they are meant to come from.
                transformer=self.hparams.transformer,
                featurizer=self.hparams.featurizer,
                thresholds=self.hparams.thresholds,
            )

            if self.hparams.train:
                self.data_train, self.data_val, self.data_test = self.split(
                    dataset=dataset,
                    lengths=self.hparams.train_val_test_split
                )
            else:
                self.data_test = self.data_predict = dataset

    def _make_dataloader(self, data, shuffle: bool):
        """Build a DataLoader over `data` with the shared sampler, collate and worker settings."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                shuffle=shuffle),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        """Shuffled loader over the training data."""
        return self._make_dataloader(self.data_train, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation data."""
        return self._make_dataloader(self.data_val, shuffle=False)

    def test_dataloader(self):
        """Unshuffled loader over the test data."""
        return self._make_dataloader(self.data_test, shuffle=False)

    def predict_dataloader(self):
        """Unshuffled loader over the prediction data."""
        return self._make_dataloader(self.data_predict, shuffle=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/utils/__init__.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, Sequence, TypeVar, Union
|
2 |
+
|
3 |
+
from deepscreen.data.utils.collator import collate_fn
|
4 |
+
from deepscreen.data.utils.label import label_transform
|
5 |
+
from deepscreen.data.utils.sampler import SafeBatchSampler
|
6 |
+
|
7 |
+
# Element type of a "flexible" argument
T = TypeVar('T')

# An argument that may be given as a single value, a sequence of values,
# or a mapping from names to values
FlexibleIterable = Union[T, Sequence[T], Dict[str, T]]
|
deepscreen/data/utils/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (701 Bytes). View file
|
|
deepscreen/data/utils/__pycache__/collator.cpython-311.pyc
ADDED
Binary file (4.97 kB). View file
|
|
deepscreen/data/utils/__pycache__/label.cpython-311.pyc
ADDED
Binary file (4.88 kB). View file
|
|
deepscreen/data/utils/__pycache__/sampler.cpython-311.pyc
ADDED
Binary file (3.56 kB). View file
|
|
deepscreen/data/utils/__pycache__/split.cpython-311.pyc
ADDED
Binary file (5.68 kB). View file
|
|
deepscreen/data/utils/collator.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Define collate functions for new data types here
|
3 |
+
"""
|
4 |
+
from functools import partial
|
5 |
+
from itertools import chain
|
6 |
+
|
7 |
+
import dgl
|
8 |
+
import torch
|
9 |
+
from torch.nn.utils.rnn import pad_sequence
|
10 |
+
from torch.utils.data._utils.collate import default_collate_fn_map, collate_tensor_fn, collate
|
11 |
+
import torch_geometric
|
12 |
+
|
13 |
+
|
14 |
+
def collate_pyg_fn(batch, collate_fn_map=None):
    """
    Collate a list of PyG data objects with ``Batch.from_data_list``.

    ``collate_fn_map`` is accepted but not used by this handler.
    """
    batch_cls = torch_geometric.data.Batch
    return batch_cls.from_data_list(batch)
|
19 |
+
|
20 |
+
|
21 |
+
def collate_dgl_fn(batch, collate_fn_map=None):
    """
    Collate a list of DGL graphs with ``dgl.batch``.

    ``collate_fn_map`` is accepted but not used by this handler.
    """
    batched_graph = dgl.batch(batch)
    return batched_graph
|
26 |
+
|
27 |
+
|
28 |
+
def pad_collate_tensor_fn(batch, padding_value=0.0, collate_fn_map=None):
    """
    Collate a list of tensors into one batch tensor, padding where shapes differ.

    Similar to pad_packed_sequence(pack_sequence(batch, enforce_sorted=False), batch_first=True),
    but additionally supports padding a list of square Tensors of size ``(L x L x ...)``.

    The padding strategy is chosen from the tensor shapes:

    * all shapes equal: the tensors are simply stacked;
    * only dimension 0 differs: ``pad_sequence`` pads dimension 0;
    * any other dimension differs: every dimension is padded (at its end) to the
      largest size found in the batch.

    The strategy is decided explicitly rather than by catching a ``RuntimeError`` from
    ``pad_sequence``, because ``pad_sequence`` copies with broadcasting: a tensor with a
    size-1 dimension (e.g. a ``1 x 1`` matrix next to a ``3 x 3`` one) would be silently
    replicated instead of padded.

    :param batch: list of tensors; they are expected to have the same number of
        dimensions (at least 1)
    :param padding_value: value written into the padded positions
    :param collate_fn_map: unused; accepted so that the signature matches what
        ``torch.utils.data._utils.collate.collate`` passes to its handlers
    :return: padded_batch, lengths (the size of dimension 0 of each original tensor)
    """
    lengths = [tensor.size(0) for tensor in batch]
    shapes = [tuple(tensor.shape) for tensor in batch]
    first_shape = shapes[0]

    if all(shape == first_shape for shape in shapes[1:]):
        # Nothing to pad
        batch = collate_tensor_fn(batch)
    elif all(shape[1:] == first_shape[1:] for shape in shapes[1:]):
        # Only the leading (sequence) dimension varies
        batch = pad_sequence(batch, batch_first=True, padding_value=padding_value)
    else:
        # Find the max size of each dimension in the batch
        max_sizes = [max(sizes) for sizes in zip(*shapes)]
        padded = []
        for tensor, shape in zip(batch, shapes):
            # torch.nn.functional.pad expects (before, after) pairs starting from the LAST dimension
            pad_widths = tuple(chain.from_iterable(
                [(0, max_sizes[dim] - shape[dim]) for dim in range(len(shape))][::-1]
            ))
            padded.append(torch.nn.functional.pad(tensor, pad_widths, mode='constant', value=padding_value))
        batch = collate_tensor_fn(padded)

    lengths = torch.as_tensor(lengths)
    # Return the padded batch tensor and the lengths
    return batch, lengths
|
58 |
+
|
59 |
+
|
60 |
+
# Join custom collate functions with the default collation map of PyTorch
|
61 |
+
# PyTorch's default type-to-handler map, extended (in a new dict) with the graph handlers above
COLLATE_FN_MAP = {
    **default_collate_fn_map,
    torch_geometric.data.data.BaseData: collate_pyg_fn,
    dgl.DGLGraph: collate_dgl_fn,
}
|
65 |
+
|
66 |
+
|
67 |
+
def collate_fn(batch, automatic_padding=False, padding_value=0):
    """
    Collate a batch of samples using ``COLLATE_FN_MAP``.

    :param batch: list of samples produced by a dataset
    :param automatic_padding: if true, tensors are collated with ``pad_collate_tensor_fn``
        (padding to a common shape) instead of PyTorch's default tensor handler
    :param padding_value: fill value used when ``automatic_padding`` is enabled
    :return: the collated batch
    """
    if automatic_padding:
        # Build a per-call map: updating the module-level COLLATE_FN_MAP in place would keep
        # padding (and this padding_value) enabled for every later call in the process.
        collate_fn_map = {
            **COLLATE_FN_MAP,
            torch.Tensor: partial(pad_collate_tensor_fn, padding_value=padding_value),
        }
    else:
        collate_fn_map = COLLATE_FN_MAP
    return collate(batch, collate_fn_map=collate_fn_map)
|
73 |
+
|
74 |
+
|
75 |
+
# class VariableLengthSequence(torch.Tensor):
|
76 |
+
# """
|
77 |
+
# A custom PyTorch Tensor class that is similar to PackedSequence, except it can be directly used as a batch tensor,
|
78 |
+
# and it has an attribute called lengths, which signifies the length of each original sequence in the batch.
|
79 |
+
# """
|
80 |
+
#
|
81 |
+
# def __new__(cls, data, lengths):
|
82 |
+
# """
|
83 |
+
# Creates a new VariableLengthSequence object from the given data and lengths.
|
84 |
+
# Args:
|
85 |
+
# data (torch.Tensor): The batch collated tensor of shape (batch_size, max_length, *).
|
86 |
+
# lengths (torch.Tensor): The lengths of each original sequence in the batch of shape (batch_size,).
|
87 |
+
# Returns:
|
88 |
+
# VariableLengthSequence: A new VariableLengthSequence object.
|
89 |
+
# """
|
90 |
+
# # Check the validity of the inputs
|
91 |
+
# assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
|
92 |
+
# assert isinstance(lengths, torch.Tensor), "lengths must be a torch.Tensor"
|
93 |
+
# assert data.dim() >= 2, "data must have at least two dimensions"
|
94 |
+
# assert lengths.dim() == 1, "lengths must have one dimension"
|
95 |
+
# assert data.size(0) == lengths.size(0), "data and lengths must have the same batch size"
|
96 |
+
# assert lengths.min() > 0, "lengths must be positive"
|
97 |
+
# assert lengths.max() <= data.size(1), "lengths must not exceed the max length of data"
|
98 |
+
#
|
99 |
+
# # Create a new tensor object from data
|
100 |
+
# obj = super().__new__(cls, data)
|
101 |
+
#
|
102 |
+
# # Set the lengths attribute
|
103 |
+
# obj.lengths = lengths
|
104 |
+
#
|
105 |
+
# return obj
|
106 |
+
|
107 |
+
|
108 |
+
# class VariableLengthSequence(torch.Tensor):
|
109 |
+
# _lengths = torch.Tensor()
|
110 |
+
#
|
111 |
+
# def __new__(cls, data, lengths, *args, **kwargs):
|
112 |
+
# self = super().__new__(cls, data, *args, **kwargs)
|
113 |
+
# self.lengths = lengths
|
114 |
+
# return self
|
115 |
+
#
|
116 |
+
# def clone(self, *args, **kwargs):
|
117 |
+
# return VariableLengthSequence(super().clone(*args, **kwargs), self.lengths.clone())
|
118 |
+
#
|
119 |
+
# def new_empty(self, *size):
|
120 |
+
# return VariableLengthSequence(super().new_empty(*size), self.lengths)
|
121 |
+
#
|
122 |
+
# def to(self, *args, **kwargs):
|
123 |
+
# return VariableLengthSequence(super().to(*args, **kwargs), self.lengths.to(*args, **kwargs))
|
124 |
+
#
|
125 |
+
# def __format__(self, format_spec):
|
126 |
+
# # Convert self to a string or a number here, depending on what you need
|
127 |
+
# return self.item().__format__(format_spec)
|
128 |
+
#
|
129 |
+
# @property
|
130 |
+
# def lengths(self):
|
131 |
+
# return self._lengths
|
132 |
+
#
|
133 |
+
# @lengths.setter
|
134 |
+
# def lengths(self, lengths):
|
135 |
+
# self._lengths = lengths
|
136 |
+
#
|
137 |
+
# def cpu(self, *args, **kwargs):
|
138 |
+
# return VariableLengthSequence(super().cpu(*args, **kwargs), self.lengths.cpu(*args, **kwargs))
|
139 |
+
#
|
140 |
+
# def cuda(self, *args, **kwargs):
|
141 |
+
# return VariableLengthSequence(super().cuda(*args, **kwargs), self.lengths.cuda(*args, **kwargs))
|
142 |
+
#
|
143 |
+
# def pin_memory(self):
|
144 |
+
# return VariableLengthSequence(super().pin_memory(), self.lengths.pin_memory())
|
145 |
+
#
|
146 |
+
# def share_memory_(self):
|
147 |
+
# super().share_memory_()
|
148 |
+
# self.lengths.share_memory_()
|
149 |
+
# return self
|
150 |
+
#
|
151 |
+
# def detach_(self, *args, **kwargs):
|
152 |
+
# super().detach_(*args, **kwargs)
|
153 |
+
# self.lengths.detach_(*args, **kwargs)
|
154 |
+
# return self
|
155 |
+
#
|
156 |
+
# def detach(self, *args, **kwargs):
|
157 |
+
# return VariableLengthSequence(super().detach(*args, **kwargs), self.lengths.detach(*args, **kwargs))
|
158 |
+
#
|
159 |
+
# def record_stream(self, *args, **kwargs):
|
160 |
+
# super().record_stream(*args, **kwargs)
|
161 |
+
# self.lengths.record_stream(*args, **kwargs)
|
162 |
+
# return self
|
163 |
+
|
164 |
+
|
165 |
+
# @classmethod
|
166 |
+
# def __torch_function__(cls, func, types, args=(), kwargs=None):
|
167 |
+
# return super().__torch_function__(func, types, args, kwargs) \
|
168 |
+
# if cls.lengths is not None else torch.Tensor.__torch_function__(func, types, args, kwargs)
|
deepscreen/data/utils/dataset.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from numbers import Number
|
2 |
+
from typing import Literal, Union, Sequence
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
from sklearn.base import TransformerMixin
|
6 |
+
from sklearn.exceptions import NotFittedError
|
7 |
+
from sklearn.utils.validation import check_is_fitted
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
|
10 |
+
from deepscreen.data.utils import label_transform, FlexibleIterable
|
11 |
+
|
12 |
+
|
13 |
+
class BaseEntityDataset(Dataset):
    """
    Base dataset that loads a CSV data table and groups its columns by name prefix.

    Columns starting with ``Y`` are read as ``float32`` and all other selected columns as
    pandas ``string``. After construction the instance exposes:

    * ``df`` - the loaded table, restricted to the selected columns
    * ``label_cols`` - columns starting with ``Y``
    * ``label_unit_cols`` - columns starting with ``U``
    * ``entity_id_cols`` - columns starting with ``ID``
    * ``entity_cols`` - columns starting with ``X``

    Subclasses must implement ``__getitem__``.
    """

    def __init__(
            self,
            dataset_path: str,
            use_col_prefixes=('X', 'Y', 'ID', 'U')
    ):
        """
        :param dataset_path: path of the CSV file (first row is the header)
        :param use_col_prefixes: a prefix or an iterable of prefixes; only columns whose
            name starts with one of them are loaded
        """
        # str.startswith only accepts a str or a tuple of str, so normalise other iterables
        if isinstance(use_col_prefixes, str):
            prefixes = (use_col_prefixes,)
        else:
            prefixes = tuple(use_col_prefixes)

        # Read the data table header row first to filter columns and create column dtype dict
        header = pd.read_csv(
            dataset_path,
            header=0, nrows=0,
            usecols=lambda col: col.startswith(prefixes)
        )
        columns = list(header.columns)

        # Read the whole data table
        df = pd.read_csv(
            dataset_path,
            header=0,
            usecols=columns,
            dtype={col: 'float32' if col.startswith('Y') else 'string' for col in columns}
        )

        self.df = df
        self.label_cols = [col for col in df.columns if col.startswith('Y')]
        self.label_unit_cols = [col for col in df.columns if col.startswith('U')]
        self.entity_id_cols = [col for col in df.columns if col.startswith('ID')]
        self.entity_cols = [col for col in df.columns if col.startswith('X')]

    def __len__(self):
        """Return the number of rows in the data table."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return the sample at ``idx``; must be provided by subclasses."""
        raise NotImplementedError
|
45 |
+
|
46 |
+
|
47 |
+
# TODO test transform
|
48 |
+
class SingleEntitySingleTargetDataset(BaseEntityDataset):
|
49 |
+
def __init__(
|
50 |
+
self,
|
51 |
+
dataset_path: str,
|
52 |
+
task: Literal['regression', 'binary', 'multiclass'],
|
53 |
+
n_classes: int,
|
54 |
+
featurizer: callable,
|
55 |
+
transformer: TransformerMixin = None,
|
56 |
+
thresholds: Union[Number, Sequence[Number]] = None,
|
57 |
+
discard_intermediate: bool = None,
|
58 |
+
forward_fill: bool = True
|
59 |
+
):
|
60 |
+
super().__init__(dataset_path)
|
61 |
+
|
62 |
+
assert len(self.entity_cols) == 1, 'The dataset contains more than 1 entity column (starting with `X`).'
|
63 |
+
if len(self.label_cols) >= 0:
|
64 |
+
assert len(self.label_cols) == 1, 'The dataset contains more than 1 label column (starting with `Y`).'
|
65 |
+
# Remove trailing `1`s in column names for flexibility
|
66 |
+
self.df.columns = self.df.columns.str.rstrip('1')
|
67 |
+
|
68 |
+
# Forward-fill non-label columns
|
69 |
+
nonlabel_cols = self.label_unit_cols + self.entity_id_cols + self.entity_cols
|
70 |
+
if forward_fill:
|
71 |
+
self.df[nonlabel_cols] = self.df[nonlabel_cols].ffill(axis=0)
|
72 |
+
|
73 |
+
# Process target labels for training/testing if exist
|
74 |
+
if self.label_cols:
|
75 |
+
# Transform target labels
|
76 |
+
self.df[self.label_cols] = self.df[self.label_cols].apply(
|
77 |
+
label_transform,
|
78 |
+
units=self.df.get('U', None),
|
79 |
+
thresholds=thresholds,
|
80 |
+
discard_intermediate=discard_intermediate).astype('float32')
|
81 |
+
|
82 |
+
# Filter out rows with a NaN in Y (missing values); use inplace to save memory
|
83 |
+
self.df.dropna(subset=self.label_cols, inplace=True)
|
84 |
+
|
85 |
+
# Validate target labels
|
86 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
87 |
+
match task:
|
88 |
+
case 'regression':
|
89 |
+
assert all(self.df['Y'].apply(lambda x: isinstance(x, Number))), \
|
90 |
+
f"Y for task `regression` must be numeric; got {set(self.df['Y'].apply(type))}."
|
91 |
+
case 'binary':
|
92 |
+
assert all(self.df['Y'].isin([0, 1])), \
|
93 |
+
f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(self.df['Y'])}." \
|
94 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
95 |
+
case 'multiclass':
|
96 |
+
assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
|
97 |
+
assert all(self.df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
|
98 |
+
f"``Y` for task `multiclass` (classification) must be non-negative integers, " \
|
99 |
+
f"but `Y` got {pd.unique(self.df['Y'])}." \
|
100 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
101 |
+
target_n_unique = self.df['Y'].nunique()
|
102 |
+
assert target_n_unique == n_classes, \
|
103 |
+
f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
|
104 |
+
f"but `Y` has {target_n_unique} unique labels."
|
105 |
+
|
106 |
+
if transformer:
|
107 |
+
self.df['X'] = self.df['X'].apply(featurizer)
|
108 |
+
try:
|
109 |
+
check_is_fitted(transformer)
|
110 |
+
self.df['X'] = list(transformer.transform(self.df['X']))
|
111 |
+
except NotFittedError:
|
112 |
+
self.df['X'] = list(transformer.fit_transform(self.df['X']))
|
113 |
+
|
114 |
+
# Skip sample-wise feature extraction because it has already been done dataset-wise
|
115 |
+
self.featurizer = lambda x: x
|
116 |
+
|
117 |
+
self.featurizer = featurizer
|
118 |
+
self.n_classes = n_classes
|
119 |
+
self.df['ID'] = self.df.get('ID', self.df['X'])
|
120 |
+
|
121 |
+
def __getitem__(self, idx):
|
122 |
+
sample = self.df.loc[idx]
|
123 |
+
return {
|
124 |
+
'X': self.featurizer(sample['X']),
|
125 |
+
'ID': sample['ID'],
|
126 |
+
'Y': sample.get('Y')
|
127 |
+
}
|
128 |
+
|
129 |
+
|
130 |
+
# TODO WIP
|
131 |
+
class MultiEntityMultiTargetDataset(BaseEntityDataset):
|
132 |
+
def __init__(
|
133 |
+
self,
|
134 |
+
dataset_path: str,
|
135 |
+
task: FlexibleIterable[Literal['regression', 'binary', 'multiclass']],
|
136 |
+
n_class: FlexibleIterable[int],
|
137 |
+
featurizers: FlexibleIterable[callable],
|
138 |
+
thresholds: FlexibleIterable[Union[Number, Sequence[Number]]] = None,
|
139 |
+
discard_intermediate: FlexibleIterable[bool] = None,
|
140 |
+
):
|
141 |
+
super().__init__(dataset_path)
|
142 |
+
label_col_prefix = tuple('Y')
|
143 |
+
nonlabel_col_prefixes = tuple(('X', 'ID', 'U'))
|
144 |
+
allowed_col_prefixes = label_col_prefix + nonlabel_col_prefixes
|
145 |
+
|
146 |
+
# Read the headers first to filter columns and create column dtype dict
|
147 |
+
df = pd.read_csv(
|
148 |
+
dataset_path,
|
149 |
+
header=0, nrows=0,
|
150 |
+
usecols=lambda col: col.startswith(allowed_col_prefixes)
|
151 |
+
)
|
152 |
+
|
153 |
+
# Read the whole table
|
154 |
+
df = pd.read_csv(
|
155 |
+
dataset_path,
|
156 |
+
header=0,
|
157 |
+
usecols=df.columns,
|
158 |
+
dtype={col: 'float32' if col.startswith('Y') else 'string' for col in df.columns}
|
159 |
+
)
|
160 |
+
label_cols = [col for col in df.columns if col.startswith(label_col_prefix)]
|
161 |
+
nonlabel_cols = [col for col in df.columns if col.startswith(nonlabel_col_prefixes)]
|
162 |
+
self.entity_cols = [col for col in nonlabel_cols if col.startswith('X')]
|
163 |
+
|
164 |
+
# Forward-fill all non-label columns
|
165 |
+
df[nonlabel_cols] = df[nonlabel_cols].ffill(axis=0)
|
166 |
+
|
167 |
+
# Process target labels for training/testing
|
168 |
+
if label_cols:
|
169 |
+
# Transform target labels
|
170 |
+
df[label_cols] = df[label_cols].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
|
171 |
+
discard_intermediate=discard_intermediate).astype('float32')
|
172 |
+
|
173 |
+
# Filter out rows with a NaN in Y (missing values)
|
174 |
+
df.dropna(subset=label_cols, inplace=True)
|
175 |
+
|
176 |
+
# Validate target labels
|
177 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
178 |
+
# WIP
|
179 |
+
match task:
|
180 |
+
case 'regression':
|
181 |
+
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
182 |
+
f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
|
183 |
+
case 'binary':
|
184 |
+
assert all(df['Y'].isin([0, 1])), \
|
185 |
+
f"Y for task `binary` must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
|
186 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
187 |
+
case 'multiclass':
|
188 |
+
assert len(label_cols) == len(n_class), \
|
189 |
+
(f'Data table has {len(label_cols)} label columns (`Y*`) but you have specified '
|
190 |
+
f'n_class of length {len(n_class)} for task `multiclass`.')
|
191 |
+
for label, n in zip(df[label_cols], n_class):
|
192 |
+
assert n >= 3, f'n_class for task `multiclass` must be at least 3.'
|
193 |
+
assert all(label.apply(lambda x: x.is_integer() and x >= 0)), \
|
194 |
+
f"Y for task `multiclass` must be non-negative integers, " \
|
195 |
+
f"but Y got {pd.unique(label)}." \
|
196 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
197 |
+
target_n_unique = label.nunique()
|
198 |
+
assert target_n_unique == n, \
|
199 |
+
f"You have set n_classes for task `multiclass` task to {n}, " \
|
200 |
+
f"but Y has {target_n_unique} unique labels."
|
201 |
+
|
202 |
+
self.df = df
|
203 |
+
self.featurizers = featurizers
|
204 |
+
self.n_class = n_class
|
205 |
+
|
206 |
+
def __len__(self):
|
207 |
+
return len(self.df.index)
|
208 |
+
|
209 |
+
# WIP
|
210 |
+
def __getitem__(self, idx):
|
211 |
+
sample = self.df.loc[idx]
|
212 |
+
return {
|
213 |
+
'X': [featurizer(x) for featurizer, x in zip(self.featurizers, sample[self.entity_cols])],
|
214 |
+
'ID': sample.get('ID', sample['X']),
|
215 |
+
'Y': sample.get('Y')
|
216 |
+
}
|
deepscreen/data/utils/label.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from numbers import Number
|
2 |
+
from typing import Optional, Union
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
from deepscreen.utils import get_logger
|
7 |
+
|
8 |
+
log = get_logger(__name__)
|
9 |
+
|
10 |
+
# Converters from a concentration expressed in the given unit to the p scale (-log10[M]).
# 'p' means the value is already on the p scale.
MOLARITY_TO_POTENCY = {
    'p': lambda x: x,
    'M': lambda x: -np.log10(x),
    'mM': lambda x: -np.log10(x) + 3,
    'μM': lambda x: -np.log10(x) + 6,  # Greek small letter mu (U+03BC)
    '\u00b5M': lambda x: -np.log10(x) + 6,  # micro sign (U+00B5), a distinct character that looks the same
    'uM': lambda x: -np.log10(x) + 6,  # ASCII spelling of micromolar
    'nM': lambda x: -np.log10(x) + 9,
    'pM': lambda x: -np.log10(x) + 12,
    'fM': lambda x: -np.log10(x) + 15,
}


# TODO rewrite for swifter.apply
def molar_to_p(labels, units):
    """Convert labels given in molar units to the p scale (-log10[M]).

    :param labels: a sequence of numeric labels
    :param units: a sequence with the unit of each label (same length as ``labels``), or a
        single unit string applied to every label; units must be keys of ``MOLARITY_TO_POTENCY``
    :return: a numpy array with the converted labels
    :raises AssertionError: if a unit is unknown or the lengths do not match
    """
    labels = np.atleast_1d(np.asarray(labels, dtype=float))
    if isinstance(units, str):
        units = [units] * len(labels)
    else:
        units = list(units)

    assert len(units) == len(labels), \
        f"Got {len(labels)} labels but {len(units)} units."
    # Check every unit individually (missing values such as NA are rejected as well)
    assert all(isinstance(unit, str) and unit in MOLARITY_TO_POTENCY for unit in units), \
        f"Allowed units: {', '.join(MOLARITY_TO_POTENCY)}."

    # Pair each label with its own unit
    return np.array([MOLARITY_TO_POTENCY[unit](label) for label, unit in zip(labels, units)])
|
32 |
+
|
33 |
+
|
34 |
+
def label_discretize(labels, thresholds):
    """Map continuous labels to discrete levels.

    :param labels: array-like of numeric labels; NaN marks a missing label
    :param thresholds: a single number, or a sequence of numbers (sorted internally).
        A single number maps labels below it to 1 and all others to 0.
        A sequence maps labels at or above the greatest threshold to 0 and labels below
        the lowest threshold to the highest level (``len(thresholds)``).
    :return: a float numpy array of levels in which missing labels stay NaN
    """
    values = np.asarray(labels, dtype=float)

    if isinstance(thresholds, Number):
        levels = 1 - np.digitize(values, [thresholds])
    else:
        # Descending bins so that larger labels get lower levels
        levels = np.digitize(values, np.sort(thresholds)[::-1])

    # np.digitize places NaN in an outer bin (level 0 here); keep missing labels missing
    # so they can be filtered out later instead of being counted as a class.
    return np.where(np.isnan(values), np.nan, levels)
|
45 |
+
|
46 |
+
|
47 |
+
def label_transform(
        labels,
        units: Optional[list[str]],
        thresholds: Optional[Union[float, list[Number]]],
        discard_intermediate: Optional[bool]
):
    """Convert labels of all units to p scale (-log10[M]) and discretize them if specified.

    :param labels: a sequence of labels, continuous or binary values
    :type labels: array_like
    :param units: a sequence of label units; allowed units are the keys of ``MOLARITY_TO_POTENCY``
    :type units: array_like, optional
    :param thresholds: discretization threshold(s) for affinity labels, in p scale (-log10[M]).
        A single number maps affinities below it to 1 and otherwise to 0.
        A tuple of two or more thresholds maps affinities to multiple discrete levels descendingly, assigning
        values below the lowest threshold to the highest level (e.g. 2) and values above the greatest threshold to 0.
        ``None``, an empty sequence or a scalar ``0`` disables discretization.
    :type thresholds: list, float, optional
    :param discard_intermediate: whether to discard the intermediate (indeterminate) level if provided an odd
        number of thresholds (>=3)
    :type discard_intermediate: bool
    :return: a numpy array of affinity labels in p scale (-log10[M]) or discrete labels; ``labels`` is returned
        unchanged when neither ``units`` nor ``thresholds`` is given
    """
    # Explicit None/length checks: the truth value of a pandas Series or numpy array is ambiguous
    if units is not None and len(units) > 0:
        labels = molar_to_p(labels, units)

    if isinstance(thresholds, Number):
        has_thresholds = bool(thresholds)
    else:
        has_thresholds = thresholds is not None and len(thresholds) > 0

    if has_thresholds:
        # Remember missing labels so that they stay NaN after discretization
        missing = np.isnan(np.asarray(labels, dtype=float))
        # Float dtype is required to hold NaN
        labels = np.array(label_discretize(labels, thresholds), dtype=float)
        labels[missing] = np.nan

        if discard_intermediate:
            # NOTE(review): an odd number of thresholds gives an even number of levels, so
            # `len(thresholds) // 2` is not a unique middle level; confirm the intended rule.
            assert not isinstance(thresholds, Number) and len(thresholds) % 2 == 1 and len(thresholds) >= 3, \
                "Must give an odd number of (at least 3) thresholds to discard the intermediate level."
            intermediate_level = len(thresholds) // 2
            # Make the intermediate-level labels NaN (which will be filtered out later)
            labels[labels == intermediate_level] = np.nan
            # Reduce all levels above the intermediate level by 1
            labels[labels > intermediate_level] -= 1

    return labels
|
93 |
+
|
deepscreen/data/utils/sampler.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Mapping, Iterable
|
2 |
+
|
3 |
+
from torch.utils.data import BatchSampler, RandomSampler, SequentialSampler
|
4 |
+
|
5 |
+
|
6 |
+
class SafeBatchSampler(BatchSampler):
    """
    A safe `batch_sampler` that skips samples with `None` values, supports shuffling, and keeps a fixed batch size.

    A sample is skipped when it is `None` itself, or when it is a mapping, list or tuple with a `None` entry.
    Every sample is fetched from `data_source` once by the sampler to perform this check.

    Args:
        data_source (Dataset): The dataset to sample from.
        batch_size (int): The size of each batch.
        drop_last (bool): Whether to drop the last batch if its size is smaller than `batch_size`. Defaults to `False`.
        shuffle (bool, optional): Whether to shuffle the data before sampling. Defaults to `True`.
        sampler (Sampler, optional): Index sampler to use instead of the one implied by `shuffle`.

    Example:
        >>> dataloader = DataLoader(dataset, batch_sampler=SafeBatchSampler(dataset, batch_size, drop_last, shuffle))
    """

    def __init__(self, data_source, batch_size: int, drop_last: bool = False, shuffle: bool = True, sampler=None):
        if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
                batch_size <= 0:
            raise ValueError(f"batch_size should be a positive integer value, but got batch_size={batch_size}")
        if not isinstance(drop_last, bool):
            raise ValueError(f"drop_last should be a boolean value, but got drop_last={drop_last}")
        # Identity check: a sampler object may define __len__ and evaluate as false
        if sampler is None:
            if shuffle:
                sampler = RandomSampler(data_source)  # type: ignore[arg-type]
            else:
                sampler = SequentialSampler(data_source)  # type: ignore[arg-type]

        super().__init__(sampler, batch_size, drop_last)
        self.data_source = data_source

    @staticmethod
    def _is_complete(sample) -> bool:
        """Return whether `sample` and its top-level fields are all not `None`."""
        if isinstance(sample, Mapping):
            fields = sample.values()
        elif isinstance(sample, (list, tuple)):
            fields = sample
        else:
            # Any other object (tensor, string, number, None, ...) is checked as a whole
            fields = (sample,)
        return all(field is not None for field in fields)

    def __iter__(self):
        """Yield lists of indices of complete samples, `batch_size` indices per list."""
        batch = []
        for idx in self.sampler:
            if not self._is_complete(self.data_source[idx]):
                continue
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                batch = []

        # Remaining indices form a final, smaller batch
        if batch and not self.drop_last:
            yield batch
|