libokj commited on
Commit
c0ec7e6
·
1 Parent(s): f386218

Upload 110 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. deepscreen/__init__.py +101 -0
  2. deepscreen/__pycache__/__init__.cpython-311.pyc +0 -0
  3. deepscreen/__pycache__/predict.cpython-311.pyc +0 -0
  4. deepscreen/data/__init__.py +0 -0
  5. deepscreen/data/__pycache__/__init__.cpython-311.pyc +0 -0
  6. deepscreen/data/__pycache__/dti.cpython-311.pyc +0 -0
  7. deepscreen/data/dti.py +422 -0
  8. deepscreen/data/dti.py.bak +369 -0
  9. deepscreen/data/dti_datamodule.py +314 -0
  10. deepscreen/data/entity_datamodule.py +167 -0
  11. deepscreen/data/featurizers/__init__.py +0 -0
  12. deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc +0 -0
  13. deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc +0 -0
  14. deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc +0 -0
  15. deepscreen/data/featurizers/categorical.py +86 -0
  16. deepscreen/data/featurizers/chem.py +48 -0
  17. deepscreen/data/featurizers/fcs.py +67 -0
  18. deepscreen/data/featurizers/fingerprint/__init__.py +45 -0
  19. deepscreen/data/featurizers/fingerprint/atompairs.py +18 -0
  20. deepscreen/data/featurizers/fingerprint/avalonfp.py +16 -0
  21. deepscreen/data/featurizers/fingerprint/estatefp.py +12 -0
  22. deepscreen/data/featurizers/fingerprint/maccskeys.py +25 -0
  23. deepscreen/data/featurizers/fingerprint/maccskeys.xlsx +0 -0
  24. deepscreen/data/featurizers/fingerprint/map4.py +130 -0
  25. deepscreen/data/featurizers/fingerprint/mhfp6.py +18 -0
  26. deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef +53 -0
  27. deepscreen/data/featurizers/fingerprint/morganfp.py +18 -0
  28. deepscreen/data/featurizers/fingerprint/pharmErGfp.py +60 -0
  29. deepscreen/data/featurizers/fingerprint/pharmPointfp.py +59 -0
  30. deepscreen/data/featurizers/fingerprint/pubchemfp.py +1731 -0
  31. deepscreen/data/featurizers/fingerprint/pubchemfp.xlsx +0 -0
  32. deepscreen/data/featurizers/fingerprint/rdkitfp.py +42 -0
  33. deepscreen/data/featurizers/fingerprint/smarts_maccskey.py +178 -0
  34. deepscreen/data/featurizers/fingerprint/smarts_pharmacophore.py +21 -0
  35. deepscreen/data/featurizers/fingerprint/smarts_pubchem.py +734 -0
  36. deepscreen/data/featurizers/fingerprint/torsions.py +18 -0
  37. deepscreen/data/featurizers/graph.py +133 -0
  38. deepscreen/data/featurizers/monn.py +106 -0
  39. deepscreen/data/featurizers/token.py +299 -0
  40. deepscreen/data/single_entity.py +195 -0
  41. deepscreen/data/utils/__init__.py +8 -0
  42. deepscreen/data/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  43. deepscreen/data/utils/__pycache__/collator.cpython-311.pyc +0 -0
  44. deepscreen/data/utils/__pycache__/label.cpython-311.pyc +0 -0
  45. deepscreen/data/utils/__pycache__/sampler.cpython-311.pyc +0 -0
  46. deepscreen/data/utils/__pycache__/split.cpython-311.pyc +0 -0
  47. deepscreen/data/utils/collator.py +168 -0
  48. deepscreen/data/utils/dataset.py +216 -0
  49. deepscreen/data/utils/label.py +93 -0
  50. deepscreen/data/utils/sampler.py +90 -0
deepscreen/__init__.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DeepScreen package initialization, registering custom objects and monkey patching for some libraries.
3
+ """
4
+ import sys
5
+ from builtins import eval
6
+
7
+ import lightning.fabric.strategies.launchers.subprocess_script as subprocess_script
8
+ import torch
9
+ from omegaconf import OmegaConf
10
+
11
+ from deepscreen.utils import get_logger
12
+
13
+ log = get_logger(__name__)
14
+
15
# Allow basic Python operations in hydra interpolation; examples:
# `in_channels: ${eval:${model.drug_encoder.out_channels}+${model.protein_encoder.out_channels}}`
# `subdir: ${eval:${hydra.job.override_dirname}.replace('/', '.')}`
# SECURITY NOTE(review): `eval` executes arbitrary Python from config
# interpolations — configs must come from trusted sources only.
OmegaConf.register_new_resolver("eval", eval)
19
+
20
+
21
def sanitize_path(path_str: str) -> str:
    """Make a string safe to use as a single filesystem path component.

    Path separators ('/' and '\\') become '.', drive/namespace colons (':')
    become '-', and the result is truncated to 255 characters (common OS
    limit for one path component).
    """
    unsafe_to_safe = str.maketrans({"/": ".", "\\": ".", ":": "-"})
    return path_str.translate(unsafe_to_safe)[:255]
26
+
27
+
28
+ OmegaConf.register_new_resolver("sanitize_path", sanitize_path)
29
+
30
+
31
def _hydra_subprocess_cmd(local_rank: int):
    """
    Monkey patching for lightning.fabric.strategies.launchers.subprocess_script._hydra_subprocess_cmd
    Temporarily fixes the problem of unnecessarily creating log folders for DDP subprocesses
    in Hydra multirun/sweep by making subprocesses reuse the launcher's output directory.

    Returns:
        (command, cwd): argv list for the DDP subprocess and the original
        working directory Hydra was launched from.
    """
    import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
    from hydra.core.hydra_config import HydraConfig
    from hydra.utils import get_original_cwd, to_absolute_path

    # when user is using hydra find the absolute path
    if __main__.__spec__ is None:  # pragma: no-cover
        command = [sys.executable, to_absolute_path(sys.argv[0])]
    else:
        command = [sys.executable, "-m", __main__.__spec__.name]

    command += sys.argv[1:]

    cwd = get_original_cwd()
    rundir = f'"{HydraConfig.get().runtime.output_dir}"'
    # Set output_subdir null since we don't want different subprocesses trying to write to config.yaml
    # FIX: the original put the comma INSIDE the string literal
    # ("hydra.output_subdir=null,"), so implicit string concatenation fused the
    # two overrides into a single malformed argv element
    # "hydra.output_subdir=null,hydra.runtime.output_dir=...". Each Hydra
    # override must be its own list element.
    command += [
        f"hydra.job.name=train_ddp_process_{local_rank}",
        "hydra.output_subdir=null",
        f"hydra.runtime.output_dir={rundir}",
    ]
    return command, cwd
55
+
56
+
57
+ subprocess_script._hydra_subprocess_cmd = _hydra_subprocess_cmd
58
+
59
+ # from torch import Tensor
60
+ # from lightning.fabric.utilities.distributed import _distributed_available
61
+ # from lightning.pytorch.utilities.rank_zero import WarningCache
62
+ # from lightning.pytorch.utilities.warnings import PossibleUserWarning
63
+ # from lightning.pytorch.trainer.connectors.logger_connector.result import _ResultCollection
64
+
65
+ # warning_cache = WarningCache()
66
+ #
67
+ # @staticmethod
68
+ # def _get_cache(result_metric, on_step: bool):
69
+ # cache = None
70
+ # if on_step and result_metric.meta.on_step:
71
+ # cache = result_metric._forward_cache
72
+ # elif not on_step and result_metric.meta.on_epoch:
73
+ # if result_metric._computed is None:
74
+ # should = result_metric.meta.sync.should
75
+ # if not should and _distributed_available() and result_metric.is_tensor:
76
+ # warning_cache.warn(
77
+ # f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)`"
78
+ # " when logging on epoch level in distributed setting to accumulate the metric across"
79
+ # " devices.",
80
+ # category=PossibleUserWarning,
81
+ # )
82
+ # result_metric.compute()
83
+ # result_metric.meta.sync.should = should
84
+ #
85
+ # cache = result_metric._computed
86
+ #
87
+ # if cache is not None:
88
+ # if isinstance(cache, Tensor):
89
+ # if not result_metric.meta.enable_graph:
90
+ # return cache.detach()
91
+ #
92
+ # return cache
93
+ #
94
+ #
95
+ # _ResultCollection._get_cache = _get_cache
96
+
97
# Enable TF32 matmuls ("high" precision mode) on GPUs with compute
# capability >= 8.0 (Ampere and newer) for faster float32 matmul.
# NOTE(review): only the current/default CUDA device is checked.
if torch.cuda.is_available():
    if torch.cuda.get_device_capability() >= (8, 0):
        torch.set_float32_matmul_precision("high")
        log.info("Your GPU supports tensor cores, "
                 "we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`")
deepscreen/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (3.28 kB). View file
 
deepscreen/__pycache__/predict.cpython-311.pyc ADDED
Binary file (3.38 kB). View file
 
deepscreen/data/__init__.py ADDED
File without changes
deepscreen/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (179 Bytes). View file
 
deepscreen/data/__pycache__/dti.cpython-311.pyc ADDED
Binary file (23 kB). View file
 
deepscreen/data/dti.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from functools import partial
3
+ from numbers import Number
4
+ from pathlib import Path
5
+ from typing import Any, Dict, Optional, Sequence, Union, Literal
6
+
7
+ from lightning import LightningDataModule
8
+ import pandas as pd
9
+ import swifter
10
+ from sklearn.preprocessing import LabelEncoder
11
+ from torch.utils.data import Dataset, DataLoader
12
+
13
+ from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
14
+ from deepscreen.utils import get_logger
15
+
16
+ log = get_logger(__name__)
17
+
18
+ SMILES_PAT = r"[^A-Za-z0-9=#:+\-\[\]<>()/\\@%,.*]"
19
+ FASTA_PAT = r"[^A-Z*\-]"
20
+
21
+
22
def validate_seq_str(seq, regex):
    """Scan a sequence string for characters disallowed by `regex`.

    Returns None when the sequence is valid, the string 'Empty string' when
    `seq` is falsy, and otherwise a comma-separated listing of the distinct
    offending characters found.
    """
    if not seq:
        return 'Empty string'
    invalid_chars = set(re.findall(regex, seq))
    return ', '.join(invalid_chars) if invalid_chars else None
31
+
32
+
33
+ # TODO: save a list of corrupted records
34
+
35
def rdkit_canonicalize(smiles):
    """Return the RDKit-canonical form of a SMILES string.

    Falls back to returning the input unchanged (with a warning) when RDKit
    cannot parse or serialize it, so no record is ever lost.
    """
    from rdkit import Chem
    try:
        mol = Chem.MolFromSmiles(smiles)
        # FIX: MolFromSmiles signals parse failure by returning None rather
        # than raising; the original silently relied on MolToSmiles(None)
        # raising a cryptic Boost ArgumentError. Raise a clear error instead.
        if mol is None:
            raise ValueError('RDKit could not parse the SMILES string.')
        cano_smiles = Chem.MolToSmiles(mol)
        return cano_smiles
    except Exception as e:
        log.warning(f'Failed to canonicalize SMILES using RDKIT due to {str(e)}. Returning original SMILES: {smiles}')
        return smiles
44
+
45
+
46
+ class DTIDataset(Dataset):
47
+ def __init__(
48
+ self,
49
+ task: Literal['regression', 'binary', 'multiclass'],
50
+ num_classes: Optional[int],
51
+ data_path: str | Path,
52
+ drug_featurizer: callable,
53
+ protein_featurizer: callable,
54
+ thresholds: Optional[Union[Number, Sequence[Number]]] = None,
55
+ discard_intermediate: Optional[bool] = False,
56
+ query: Optional[str] = 'X2'
57
+ ):
58
+ df = pd.read_csv(
59
+ data_path,
60
+ engine='python',
61
+ header=0,
62
+ usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
63
+ dtype={
64
+ 'X1': 'str',
65
+ 'ID1': 'str',
66
+ 'X2': 'str',
67
+ 'ID2': 'str',
68
+ 'Y': 'float32',
69
+ 'U': 'str',
70
+ },
71
+ )
72
+ # Read the whole data table
73
+
74
+ # if 'ID1' in df:
75
+ # self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
76
+ # if 'ID2' in df:
77
+ # self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
78
+ # self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
79
+ # self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
80
+
81
+ # # train and eval mode data processing (fully labelled)
82
+ # if 'Y' in df.columns and df['Y'].notnull().all():
83
+ log.info(f"Processing data file: {data_path}")
84
+
85
+ # Forward-fill all non-label columns
86
+ df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
87
+
88
+ # TODO potentially allow running through the whole data validation process
89
+ # error = False
90
+
91
+ if 'Y' in df:
92
+ log.info(f"Validating labels (`Y`)...")
93
+ # TODO: check sklearn.utils.multiclass.check_classification_targets
94
+ match task:
95
+ case 'regression':
96
+ assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
97
+ f"""`Y` must be numeric for `regression` task,
98
+ but it has {set(df['Y'].swifter.apply(type))}."""
99
+
100
+ case 'binary':
101
+ if all(df['Y'].isin([0, 1])):
102
+ assert not thresholds, \
103
+ f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
104
+ but still got `thresholds` ({thresholds}).
105
+ Double check your choices of `task` and `thresholds`, and records in the `Y` column."""
106
+ else:
107
+ assert thresholds, \
108
+ f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
109
+ but it has {pd.unique(df['Y'])}.
110
+ You may set `thresholds` to discretize continuous labels.""" # TODO print err idx instead
111
+
112
+ case 'multiclass':
113
+ assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
114
+
115
+ if all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)):
116
+ assert not thresholds, \
117
+ f"""`Y` is already non-negative integers for
118
+ `multiclass` (classification) `task`, but still got `thresholds` ({thresholds}).
119
+ Double check your choice of `task`, `thresholds` and records in the `Y` column."""
120
+ else:
121
+ assert thresholds, \
122
+ f"""`Y` must be non-negative integers for
123
+ `multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
124
+ You must set `thresholds` to discretize continuous labels.""" # TODO print err idx instead
125
+
126
+ if 'U' in df.columns:
127
+ units = df['U']
128
+ else:
129
+ units = None
130
+ log.warning("Units ('U') not in the data table. "
131
+ "Assuming all labels to be discrete or in p-scale (-log10[M]).")
132
+
133
+ # Transform labels
134
+ df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
135
+ discard_intermediate=discard_intermediate)
136
+
137
+ # Filter out rows with a NaN in Y (missing values)
138
+ df.dropna(subset=['Y'], inplace=True)
139
+
140
+ match task:
141
+ case 'regression':
142
+ df['Y'] = df['Y'].astype('float32')
143
+ assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
144
+ f"""`Y` must be numeric for `regression` task,
145
+ but after transformation it still has {set(df['Y'].swifter.apply(type))}.
146
+ Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
147
+ # TODO print err idx instead
148
+ case 'binary':
149
+ df['Y'] = df['Y'].astype('int')
150
+ assert all(df['Y'].isin([0, 1])), \
151
+ f"""`Y` must be 0 or 1 for `task=binary`, "
152
+ but after transformation it still has {pd.unique(df['Y'])}.
153
+ Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
154
+ # TODO print err idx instead
155
+ case 'multiclass':
156
+ df['Y'] = df['Y'].astype('int')
157
+ assert all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)), \
158
+ f"""Y must be non-negative integers for `task=multiclass`
159
+ but after transformation it still has {pd.unique(df['Y'])}.
160
+ Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
161
+ # TODO print err idx instead
162
+ target_n_unique = df['Y'].nunique()
163
+ assert target_n_unique == num_classes, \
164
+ f"""You have set `num_classes` for `task=multiclass` to {num_classes},
165
+ but after transformation Y still has {target_n_unique} unique labels.
166
+ Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
167
+
168
+ log.info("Validating SMILES (`X1`)...")
169
+ df['X1_ERR'] = df['X1'].swifter.progress_bar(
170
+ desc="Validating SMILES...").apply(validate_seq_str, regex=SMILES_PAT)
171
+ if not df['X1_ERR'].isna().all():
172
+ raise Exception(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
173
+ df['X1^'] = df['X1'].apply(rdkit_canonicalize) # swifter
174
+
175
+ log.info("Validating FASTA (`X2`)...")
176
+ df['X2'] = df['X2'].str.upper()
177
+ df['X2_ERR'] = df['X2'].swifter.progress_bar(
178
+ desc="Validating FASTA...").apply(validate_seq_str, regex=FASTA_PAT)
179
+ if not df['X2_ERR'].isna().all():
180
+ raise Exception(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
181
+
182
+ # FASTA/SMILES indices as query for retrieval metrics like enrichment factor and hit rate
183
+ if query:
184
+ df['ID^'] = LabelEncoder().fit_transform(df[query])
185
+
186
+ self.df = df
187
+ self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
188
+ self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
189
+
190
+ def __len__(self):
191
+ return len(self.df.index)
192
+
193
+ def __getitem__(self, i):
194
+ sample = self.df.loc[i]
195
+ return {
196
+ 'N': i,
197
+ 'X1': sample['X1'],
198
+ 'X1^': self.drug_featurizer(sample['X1^']),
199
+ 'ID1': sample.get('ID1'),
200
+ 'X2': sample['X2'],
201
+ 'X2^': self.protein_featurizer(sample['X2']),
202
+ 'ID2': sample.get('ID2'),
203
+ 'Y': sample.get('Y'),
204
+ 'ID^': sample.get('ID^'),
205
+ }
206
+
207
+
208
class DTIDataModule(LightningDataModule):
    """
    DTI DataModule

    A DataModule implements 5 key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
            self,
            task: Literal['regression', 'binary', 'multiclass'],
            num_classes: Optional[int],
            batch_size: int,
            # train: bool,
            drug_featurizer: callable,
            protein_featurizer: callable,
            collator: callable = collate_fn,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
            split: Optional[callable] = None,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            discard_intermediate: Optional[bool] = False,
            num_workers: int = 0,
            pin_memory: bool = False,
    ):
        super().__init__()

        # Dataset slots; populated lazily by `setup`.
        self.train_data: Optional[Dataset] = None
        self.val_data: Optional[Dataset] = None
        self.test_data: Optional[Dataset] = None
        self.predict_data: Optional[Dataset] = None
        self.split = split
        self.collator = collator
        # Dataset factory pre-bound with everything except `data_path`.
        self.dataset = partial(
            DTIDataset,
            task=task,
            num_classes=num_classes,
            drug_featurizer=drug_featurizer,
            protein_featurizer=protein_featurizer,
            thresholds=thresholds,
            discard_intermediate=discard_intermediate
        )

        # this line allows to access init params with 'self.hparams' ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)  # ignore=['split']

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.
        """
        # NOTE(review): the `encoding` parameter is accepted but never used in
        # this body — confirm whether it should reach the CSV reader.
        # load and split datasets only if not loaded in initialization
        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
            if self.hparams.train_val_test_split:
                if len(self.hparams.train_val_test_split) != 3:
                    raise ValueError('Length of `train_val_test_split` must be 3. '
                                     'Set the second element to None for training without validation. '
                                     'Set the third element to None for training without testing.')

                # Seed the three slots with the raw split spec (numbers or file
                # paths); each non-None slot is replaced by a Dataset below.
                self.train_data = self.hparams.train_val_test_split[0]
                self.val_data = self.hparams.train_val_test_split[1]
                self.test_data = self.hparams.train_val_test_split[2]

                # Mode 1: single data file + split callable + numeric lengths.
                if all([self.hparams.data_file, self.split]):
                    if all(isinstance(split, Number) or split is None
                           for split in self.hparams.train_val_test_split):
                        split_data = self.split(
                            dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
                            lengths=[split for split in self.hparams.train_val_test_split if split is not None]
                        )
                        # Hand out the split subsets in order, skipping None slots.
                        for dataset in ['train_data', 'val_data', 'test_data']:
                            if getattr(self, dataset) is not None:
                                setattr(self, dataset, split_data.pop(0))

                    else:
                        raise ValueError('`train_val_test_split` must be a sequence numbers or None'
                                         '(float for percentages and int for sample numbers) '
                                         'if both `data_file` and `split` have been specified.')

                # Mode 2: three explicit data files (relative paths resolved
                # against `data_dir`).
                elif (all(isinstance(split, str) or split is None
                          for split in self.hparams.train_val_test_split)
                      and not any([self.hparams.data_file, self.split])):
                    for dataset in ['train_data', 'val_data', 'test_data']:
                        if getattr(self, dataset) is not None:
                            data_path = Path(getattr(self, dataset))
                            if not data_path.is_absolute():
                                data_path = Path(self.hparams.data_dir, data_path)
                            setattr(self, dataset, self.dataset(data_path=data_path))

                else:
                    raise ValueError('For training, you must specify either all of `data_file`, `split`, '
                                     'and `train_val_test_split` as a sequence of numbers or '
                                     'solely `train_val_test_split` as a sequence of data file paths.')

            # Mode 3: test/predict only — one data file, no splitting; the same
            # dataset object serves both test and predict stages.
            elif self.hparams.data_file and not any([self.split, self.hparams.train_val_test_split]):
                data_path = Path(self.hparams.data_file)
                if not data_path.is_absolute():
                    data_path = Path(self.hparams.data_dir, data_path)
                self.test_data = self.predict_data = self.dataset(data_path=data_path)

            else:
                raise ValueError("For training, you must specify `train_val_test_split`. "
                                 "For testing/predicting, you must specify only `data_file` without "
                                 "`train_val_test_split` or `split`.")

    def train_dataloader(self):
        """Training dataloader: shuffled; drops the last incomplete batch."""
        return DataLoader(
            dataset=self.train_data,
            batch_sampler=SafeBatchSampler(
                data_source=self.train_data,
                batch_size=self.hparams.batch_size,
                # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
                # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
                drop_last=True,
                shuffle=True,
            ),
            # batch_size=self.hparams.batch_size,
            # shuffle=True,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=True if self.hparams.num_workers > 0 else False
        )

    def val_dataloader(self):
        """Validation dataloader: deterministic order, keeps all samples."""
        return DataLoader(
            dataset=self.val_data,
            batch_sampler=SafeBatchSampler(
                data_source=self.val_data,
                batch_size=self.hparams.batch_size,
                drop_last=False,
                shuffle=False
            ),
            # batch_size=self.hparams.batch_size,
            # shuffle=False,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=True if self.hparams.num_workers > 0 else False
        )

    def test_dataloader(self):
        """Test dataloader: deterministic order, keeps all samples."""
        return DataLoader(
            dataset=self.test_data,
            batch_sampler=SafeBatchSampler(
                data_source=self.test_data,
                batch_size=self.hparams.batch_size,
                drop_last=False,
                shuffle=False
            ),
            # batch_size=self.hparams.batch_size,
            # shuffle=False,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=True if self.hparams.num_workers > 0 else False
        )

    def predict_dataloader(self):
        """Prediction dataloader: deterministic order, keeps all samples."""
        return DataLoader(
            dataset=self.predict_data,
            batch_sampler=SafeBatchSampler(
                data_source=self.predict_data,
                batch_size=self.hparams.batch_size,
                drop_last=False,
                shuffle=False
            ),
            # batch_size=self.hparams.batch_size,
            # shuffle=False,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=True if self.hparams.num_workers > 0 else False
        )

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
deepscreen/data/dti.py.bak ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from numbers import Number
3
+ from pathlib import Path
4
+ from typing import Any, Dict, Optional, Sequence, Union, Literal
5
+
6
+ from lightning import LightningDataModule
7
+ import pandas as pd
8
+ from sklearn.preprocessing import LabelEncoder
9
+ from torch.utils.data import Dataset, DataLoader
10
+
11
+ from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
12
+ from deepscreen.utils import get_logger
13
+
14
+ log = get_logger(__name__)
15
+
16
+
17
+ # TODO: save a list of corrupted records
18
+
19
+
20
class DTIDataset(Dataset):
    """Legacy (.bak) drug–target interaction dataset.

    Backup variant of `dti.py`'s DTIDataset: uses `n_class` instead of
    `num_classes`, has no `query` parameter (always label-encodes `X2` into
    `IDX`), performs no SMILES/FASTA regex validation or canonicalization,
    and featurizes raw `X1`/`X2` directly in `__getitem__`.
    """

    def __init__(
            self,
            task: Literal['regression', 'binary', 'multiclass'],
            n_class: Optional[int],
            data_path: str | Path,
            drug_featurizer: callable,
            protein_featurizer: callable,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            discard_intermediate: Optional[bool] = False,
    ):
        # Read the whole data table, keeping only the known columns.
        df = pd.read_csv(
            data_path,
            engine='python',
            header=0,
            usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
            dtype={
                'X1': 'str',
                'ID1': 'str',
                'X2': 'str',
                'ID2': 'str',
                'Y': 'float32',
                'U': 'str',
            },
        )

        # if 'ID1' in df:
        #     self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
        # if 'ID2' in df:
        #     self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
        #     self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
        #     self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))

        # # train and eval mode data processing (fully labelled)
        # if 'Y' in df.columns and df['Y'].notnull().all():
        log.info(f"Processing data file: {data_path}")

        # Forward-fill all non-label columns
        df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)

        if 'Y' in df:
            log.info(f"Performing pre-transformation target validation.")
            # TODO: check sklearn.utils.multiclass.check_classification_targets
            # Raw labels must already match the declared task, or `thresholds`
            # must be supplied to discretize them.
            match task:
                case 'regression':
                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
                        f"""`Y` must be numeric for `regression` task,
                        but it has {set(df['Y'].apply(type))}."""

                case 'binary':
                    if all(df['Y'].isin([0, 1])):
                        assert not thresholds, \
                            f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
                            but still got `thresholds` {thresholds}.
                            Double check your choices of `task` and `thresholds` and records in the `Y` column."""
                    else:
                        assert thresholds, \
                            f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
                            but it has {pd.unique(df['Y'])}.
                            You must set `thresholds` to discretize continuous labels."""

                case 'multiclass':
                    assert n_class >= 3, f'`n_class` for `multiclass` (classification) `task` must be at least 3.'

                    if all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)):
                        assert not thresholds, \
                            f"""`Y` is already non-negative integers for
                            `multiclass` (classification) `task`, but still got `thresholds` {thresholds}.
                            Double check your choice of `task`, `thresholds` and records in the `Y` column."""
                    else:
                        assert thresholds, \
                            f"""`Y` must be non-negative integers for
                            `multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
                            You must set `thresholds` to discretize continuous labels."""

            if 'U' in df.columns:
                units = df['U']
            else:
                units = None
                log.warning("Units ('U') not in the data table. "
                            "Assuming all labels to be discrete or in p-scale (-log10[M]).")

            # Transform labels
            df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
                                      discard_intermediate=discard_intermediate)

            # Filter out rows with a NaN in Y (missing values)
            # NOTE(review): rows are dropped but the index is never reset,
            # while `__getitem__` uses label-based `df.loc[i]` — confirm
            # callers never request a dropped position.
            df.dropna(subset=['Y'], inplace=True)

            log.info(f"Performing post-transformation target validation.")
            match task:
                case 'regression':
                    df['Y'] = df['Y'].astype('float32')
                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
                        f"""`Y` must be numeric for `regression` task,
                        but after transformation it still has {set(df['Y'].apply(type))}.
                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""

                case 'binary':
                    df['Y'] = df['Y'].astype('int')
                    assert all(df['Y'].isin([0, 1])), \
                        f"""`Y` must be 0 or 1 for `binary` (classification) `task`, "
                        but after transformation it still has {pd.unique(df['Y'])}.
                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""

                case 'multiclass':
                    df['Y'] = df['Y'].astype('int')
                    assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
                        f"""Y must be non-negative integers for task `multiclass` (classification)
                        but after transformation it still has {pd.unique(df['Y'])}.
                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""

                    target_n_unique = df['Y'].nunique()
                    assert target_n_unique == n_class, \
                        f"""You have set `n_class` for `multiclass` (classification) `task` to {n_class},
                        but after transformation Y still has {target_n_unique} unique labels.
                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""

        # Indexed protein/FASTA for retrieval metrics
        df['IDX'] = LabelEncoder().fit_transform(df['X2'])

        self.df = df
        # Identity fallbacks keep __getitem__ branch-free.
        self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
        self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)

    def __len__(self):
        """Number of interaction records."""
        return len(self.df.index)

    def __getitem__(self, i):
        """Return record i with featurized X1/X2; IDs fall back to the raw sequences."""
        sample = self.df.loc[i]
        return {
            'N': i,
            'X1': self.drug_featurizer(sample['X1']),
            'ID1': sample.get('ID1', sample['X1']),
            'X2': self.protein_featurizer(sample['X2']),
            'ID2': sample.get('ID2', sample['X2']),
            'Y': sample.get('Y'),
            'IDX': sample['IDX'],
        }
+
161
+
162
class DTIDataModule(LightningDataModule):
    """LightningDataModule for drug-target interaction (DTI) data.

    Wraps `DTIDataset` construction and the train/val/test/predict
    dataloaders. Two mutually exclusive modes are supported:

    * training — either `data_file` + `split` with a numeric
      `train_val_test_split`, or `train_val_test_split` given as a sequence
      of per-split data file paths;
    * testing/predicting — `data_file` alone.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
        self,
        task: Literal['regression', 'binary', 'multiclass'],
        n_class: Optional[int],
        batch_size: int,
        drug_featurizer: callable,
        protein_featurizer: callable,
        collator: callable = collate_fn,
        data_dir: str = "data/",
        data_file: Optional[str] = None,
        train_val_test_split: Optional[Sequence[Number | str]] = None,
        split: Optional[callable] = None,
        thresholds: Optional[Union[Number, Sequence[Number]]] = None,
        discard_intermediate: Optional[bool] = False,
        num_workers: int = 0,
        pin_memory: bool = False,
    ):
        """
        :param task: learning task; drives label validation in `DTIDataset`.
        :param n_class: number of classes for `multiclass`; ignored otherwise.
        :param batch_size: samples per batch for every dataloader.
        :param drug_featurizer: callable mapping a drug string (SMILES) to model input.
        :param protein_featurizer: callable mapping a protein string (FASTA) to model input.
        :param collator: batch collation function passed to every DataLoader.
        :param data_dir: base directory prepended to relative data paths.
        :param data_file: single dataset file (test/predict mode, or training with `split`).
        :param train_val_test_split: split sizes (numbers) or per-split file paths (strings).
        :param split: callable splitting one dataset into train/val/test subsets.
        :param thresholds: threshold(s) used to discretize continuous labels.
        :param discard_intermediate: drop labels falling between thresholds.
        :param num_workers: DataLoader worker processes.
        :param pin_memory: pin batches in page-locked memory for faster GPU transfer.
        """
        super().__init__()

        self.train_data: Optional[Dataset] = None
        self.val_data: Optional[Dataset] = None
        self.test_data: Optional[Dataset] = None
        self.predict_data: Optional[Dataset] = None
        self.split = split
        self.collator = collator
        # Factory pre-bound with everything except the data path, so each
        # split can be materialized from its own file.
        self.dataset = partial(
            DTIDataset,
            task=task,
            n_class=n_class,
            drug_featurizer=drug_featurizer,
            protein_featurizer=protein_featurizer,
            thresholds=thresholds,
            discard_intermediate=discard_intermediate
        )

        if train_val_test_split:
            # TODO test behavior for trainer.test and predict when this is passed
            if len(train_val_test_split) not in [2, 3]:
                raise ValueError('Length of `train_val_test_split` must be 2 (for training without testing) or 3.')
            if all([data_file, split]):
                # Actual splitting is deferred to `setup`; only validate sizes here.
                # Loop variable renamed from `split` to avoid shadowing the parameter.
                if not all(isinstance(size, Number) for size in train_val_test_split):
                    raise ValueError('`train_val_test_split` must be a sequence numbers '
                                     '(float for percentages and int for sample numbers) '
                                     'if both `data_file` and `split` have been specified.')
            elif all(isinstance(item, str) for item in train_val_test_split) and not any([data_file, split]):
                # Pre-split data files: build one dataset per split immediately.
                split_paths = []
                for file in train_val_test_split:
                    path = Path(file)
                    if not path.is_absolute():
                        path = Path(data_dir, path)
                    split_paths.append(path)

                self.train_data = self.dataset(data_path=split_paths[0])
                self.val_data = self.dataset(data_path=split_paths[1])
                if len(train_val_test_split) == 3:
                    self.test_data = self.dataset(data_path=split_paths[2])
            else:
                raise ValueError('For training, you must specify either `data_file`, `split`, '
                                 'and `train_val_test_split` as a sequence of numbers or '
                                 'solely `train_val_test_split` as a sequence of data file paths.')

        elif data_file and not any([split, train_val_test_split]):
            data_file = Path(data_file)
            if not data_file.is_absolute():
                data_file = Path(data_dir, data_file)
            self.test_data = self.predict_data = self.dataset(data_path=data_file)
        else:
            raise ValueError("For training, you must specify `train_val_test_split`. "
                             "For testing/predicting, you must specify only `data_file` without "
                             "`train_val_test_split` or `split`.")

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

    def prepare_data(self):
        """Download data if needed. Do not assign state here (runs on one process only)."""

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """Load and split the dataset unless it was already materialized in `__init__`.

        Lightning calls this for both `trainer.fit()` and `trainer.test()`,
        so the split is guarded to avoid running twice.
        """
        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
            self.train_data, self.val_data, self.test_data = self.split(
                dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
                lengths=self.hparams.train_val_test_split
            )

    def _dataloader(self, data, shuffle: bool, drop_last: bool):
        """Build a DataLoader over `data` with the module-wide loader settings."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                drop_last=drop_last,
                shuffle=shuffle,
            ),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        # Dropping the last batch prevents problems caused by variable batch sizes in training,
        # e.g. batch_size=1 in BatchNorm; shuffling covers all samples over epochs.
        return self._dataloader(self.train_data, shuffle=True, drop_last=True)

    def val_dataloader(self):
        return self._dataloader(self.val_data, shuffle=False, drop_last=False)

    def test_dataloader(self):
        return self._dataloader(self.test_data, shuffle=False, drop_last=False)

    def predict_dataloader(self):
        return self._dataloader(self.predict_data, shuffle=False, drop_last=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
deepscreen/data/dti_datamodule.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from itertools import product
2
+ from collections import namedtuple
3
+ from numbers import Number
4
+ from typing import Any, Dict, Optional, Sequence, Union, Literal
5
+
6
+ # import numpy as np
7
+ import pandas as pd
8
+ from lightning import LightningDataModule
9
+ from torch.utils.data import Dataset, DataLoader, random_split
10
+
11
+ from deepscreen.data.utils.label import label_transform
12
+ from deepscreen.data.utils.collator import collate_fn
13
+ from deepscreen.data.utils.sampler import SafeBatchSampler
14
+
15
+
16
class DTIDataset(Dataset):
    """Map-style dataset of drug-target interaction records loaded from a CSV file.

    The CSV at ``{data_dir}{dataset_name}.csv`` may contain the columns
    X1 (drug, e.g. SMILES), ID1, X2 (target, e.g. FASTA), ID2, Y (label) and
    U (label unit); other columns are ignored. When a ``Y`` column exists,
    labels are transformed with ``label_transform`` and validated against
    ``task``.
    """

    def __init__(
        self,
        task: Literal['regression', 'binary', 'multiclass'],
        n_classes: Optional[int],
        data_dir: str,
        dataset_name: str,
        drug_featurizer: callable,
        protein_featurizer: callable,
        thresholds: Optional[Union[Number, Sequence[Number]]] = None,
        discard_intermediate: Optional[bool] = False,
    ):
        # Only the recognized columns are read; everything is parsed as strings
        # except Y, which is coerced to float32.
        df = pd.read_csv(
            f'{data_dir}{dataset_name}.csv',
            header=0, sep=',',
            usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
            dtype={'X1': 'str', 'ID1': 'str',
                   'X2': 'str', 'ID2': 'str',
                   'Y': 'float32', 'U': 'str'}
        )

        # Forward-fill all non-label columns
        df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)

        if 'Y' in df:
            # Transform labels (e.g., unit conversion and/or thresholding into classes)
            df['Y'] = df['Y'].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
                                    discard_intermediate=discard_intermediate).astype('float32')

            # Filter out rows with a NaN in Y (missing values)
            df.dropna(subset=['Y'], inplace=True)

            # Validate target labels for training/testing
            # TODO: check sklearn.utils.multiclass.check_classification_targets
            match task:
                case 'regression':
                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
                        f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
                case 'binary':
                    assert all(df['Y'].isin([0, 1])), \
                        f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
                        "\nYou may set `thresholds` to discretize continuous labels."
                case 'multiclass':
                    assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
                    assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
                        f"Y for task `multiclass` (classification) must be non-negative integers, " \
                        f"but Y got {pd.unique(df['Y'])}." \
                        "\nYou may set `thresholds` to discretize continuous labels."
                    target_n_unique = df['Y'].nunique()
                    assert target_n_unique == n_classes, \
                        f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
                        f"but Y has {target_n_unique} unique labels."

        self.df = df
        # Fall back to identity featurizers so raw strings pass through unchanged.
        self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
        self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
        # NOTE(review): this overwrites the validated `n_classes` argument with the
        # number of distinct labels in the data, and raises KeyError when the CSV
        # has no Y column — confirm this is intended.
        self.n_classes = df['Y'].nunique()

        # Record container returned by __getitem__ (featurized drug/target, IDs, label).
        self.Data = namedtuple('Data', ['FT1', 'ID1', 'FT2', 'ID2', 'Y'])

    def __len__(self):
        # Number of (drug, target, label) records.
        return len(self.df.index)

    def __getitem__(self, idx):
        sample = self.df.loc[idx]
        # IDs default to the raw sequence strings when no ID columns are present.
        return self.Data(
            FT1=self.drug_featurizer(sample['X1']),
            ID1=sample.get('ID1', sample['X1']),
            FT2=self.protein_featurizer(sample['X2']),
            ID2=sample.get('ID2', sample['X2']),
            Y=sample.get('Y')
        )
142
+
143
+
144
class DTIdatamodule(LightningDataModule):
    """LightningDataModule wrapping `DTIDataset`.

    Builds one `DTIDataset` in `setup` and either splits it into
    train/val/test subsets (train mode) or reuses it for test/predict
    (evaluation mode), then serves the subsets through uniform dataloaders.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
        self,
        task: Literal['regression', 'binary', 'multiclass'],
        n_classes: Optional[int],
        train: bool,
        drug_featurizer: callable,
        protein_featurizer: callable,
        batch_size: int,
        train_val_test_split: Optional[Sequence[Number]],
        num_workers: int = 0,
        thresholds: Optional[Union[Number, Sequence[Number]]] = None,
        pin_memory: bool = False,
        data_dir: str = "data/",
        dataset_name: Optional[str] = None,
        split: Optional[callable] = random_split,
    ):
        super().__init__()

        # Expose init params via `self.hparams` and persist them in checkpoints.
        self.save_hyperparameters(logger=False)

        # Splitting strategy applied in `setup` (train mode only).
        self.data_split = split

        self.data_train: Optional[Dataset] = None
        self.data_val: Optional[Dataset] = None
        self.data_test: Optional[Dataset] = None
        self.data_predict: Optional[Dataset] = None

    def prepare_data(self):
        """Download data if needed. Do not assign state here (runs on one process only)."""

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """Instantiate and (in train mode) split the dataset, exactly once.

        Lightning calls this for both `trainer.fit()` and `trainer.test()`,
        so the work is skipped when the subsets already exist.
        """
        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
        already_loaded = any([self.data_train, self.data_val, self.data_test, self.data_predict])
        if already_loaded:
            return

        dataset = DTIDataset(
            task=self.hparams.task,
            n_classes=self.hparams.n_classes,
            data_dir=self.hparams.data_dir,
            drug_featurizer=self.hparams.drug_featurizer,
            protein_featurizer=self.hparams.protein_featurizer,
            dataset_name=self.hparams.dataset_name,
            thresholds=self.hparams.thresholds,
        )

        if self.hparams.train:
            self.data_train, self.data_val, self.data_test = self.data_split(
                dataset=dataset,
                lengths=self.hparams.train_val_test_split
            )
        else:
            self.data_test = self.data_predict = dataset

    def _dataloader(self, data, shuffle: bool, drop_last: bool):
        """Shared DataLoader construction for all four stages."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                drop_last=drop_last,
                shuffle=shuffle,
            ),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        # Shuffle for coverage across epochs; drop the last (possibly short) batch.
        return self._dataloader(self.data_train, shuffle=True, drop_last=True)

    def val_dataloader(self):
        return self._dataloader(self.data_val, shuffle=False, drop_last=False)

    def test_dataloader(self):
        return self._dataloader(self.data_test, shuffle=False, drop_last=False)

    def predict_dataloader(self):
        return self._dataloader(self.data_predict, shuffle=False, drop_last=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
+ pass
deepscreen/data/entity_datamodule.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from numbers import Number
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Optional, Sequence, Type
4
+
5
+ from lightning import LightningDataModule
6
+ from sklearn.base import TransformerMixin
7
+ from torch.utils.data import Dataset, DataLoader
8
+
9
+ from deepscreen.data.utils import collate_fn, SafeBatchSampler
10
+ from deepscreen.data.utils.dataset import BaseEntityDataset
11
+
12
+
13
class EntityDataModule(LightningDataModule):
    """LightningDataModule for single-entity datasets.

    Training mode (`train=True`) expects either `data_file` + `split` with a
    numeric `train_val_test_split`, or `train_val_test_split` given as three
    data file names. Evaluation mode (`train=False`) expects `data_file` only.
    """

    def __init__(
        self,
        dataset: type[BaseEntityDataset],
        transformer: type[TransformerMixin],
        train: bool,
        batch_size: int,
        data_dir: str = "data/",
        data_file: Optional[str] = None,
        # Fixed: `Optional[Sequence[Number], Sequence[str]]` is invalid typing
        # syntax and raises TypeError when the class is defined.
        train_val_test_split: Optional[Sequence[Number | str]] = None,
        split: Optional[callable] = None,
        num_workers: int = 0,
        pin_memory: bool = False,
    ):
        """
        :param dataset: dataset class instantiated per data file.
        :param transformer: scikit-learn style transformer class (fit/transform wiring TBD; see setup).
        :param train: True for fit mode, False for test/predict mode.
        :param batch_size: samples per batch for every dataloader.
        :param data_dir: base directory for relative data paths.
        :param data_file: single dataset file name.
        :param train_val_test_split: split sizes (numbers) or three file names (strings).
        :param split: callable that splits one dataset into train/val/test subsets.
        :param num_workers: DataLoader worker processes.
        :param pin_memory: pin batches in page-locked memory for faster GPU transfer.
        """
        super().__init__()

        # data processing
        self.split = split

        # Initialize every subset handle so `setup` can safely test them
        # regardless of which branch below populates them (previously some
        # branches left attributes undefined, crashing `setup`).
        self.train_data: Optional[Dataset] = None
        self.val_data: Optional[Dataset] = None
        self.test_data: Optional[Dataset] = None
        self.predict_data: Optional[Dataset] = None

        if train:
            if all([data_file, split]):
                # Splitting deferred to `setup`; only validate the sizes here.
                if not all(isinstance(size, Number) for size in train_val_test_split):
                    raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
                                     '(float for percentages and int for sample numbers) if '
                                     '`data_file` and `split` have been specified.')
            elif all(isinstance(item, str) for item in train_val_test_split) and not any([data_file, split]):
                self.train_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[0]))
                self.val_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[1]))
                self.test_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[2]))
            else:
                raise ValueError('For training (train=True), you must specify either '
                                 '`dataset_name` and `split` with `train_val_test_split` of 3 numbers or '
                                 'solely `train_val_test_split` of 3 data file names.')
        else:
            if data_file and not any([split, train_val_test_split]):
                self.test_data = self.predict_data = dataset(dataset_path=str(Path(data_dir) / data_file))
            else:
                raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
                                 "`train_val_test_split` or `split`")

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

    def prepare_data(self):
        """Download data if needed. Do not assign state here (runs on one process only)."""

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """Load and split the dataset unless it was already materialized in `__init__`."""
        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
        # TODO: find a way to apply transformer.fit_transform only to train and transformer.transform only to val, test
        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
            # NOTE(review): `self.hparams.train_dataset_name` is not an __init__
            # parameter; this likely intends `self.hparams.data_file` — confirm.
            self.train_data, self.val_data, self.test_data = self.split(
                dataset=self.hparams.dataset(data_dir=self.hparams.data_dir,
                                             dataset_name=self.hparams.train_dataset_name),
                lengths=self.hparams.train_val_test_split
            )

    def _dataloader(self, data, shuffle: bool):
        """Build a DataLoader over `data` with the module-wide loader settings."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                shuffle=shuffle),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        return self._dataloader(self.train_data, shuffle=True)

    def val_dataloader(self):
        return self._dataloader(self.val_data, shuffle=False)

    def test_dataloader(self):
        return self._dataloader(self.test_data, shuffle=False)

    def predict_dataloader(self):
        return self._dataloader(self.predict_data, shuffle=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
deepscreen/data/featurizers/__init__.py ADDED
File without changes
deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (191 Bytes). View file
 
deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc ADDED
Binary file (5.6 kB). View file
 
deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc ADDED
Binary file (14.9 kB). View file
 
deepscreen/data/featurizers/categorical.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ # Sets of KNOWN characters in SMILES and FASTA sequences
4
+ # Use list instead of set to preserve character order
5
+ SMILES_VOCAB = ('#', '%', ')', '(', '+', '-', '.', '1', '0', '3', '2', '5', '4',
6
+ '7', '6', '9', '8', '=', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I',
7
+ 'H', 'K', 'M', 'L', 'O', 'N', 'P', 'S', 'R', 'U', 'T', 'W', 'V',
8
+ 'Y', '[', 'Z', ']', '_', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i',
9
+ 'h', 'm', 'l', 'o', 'n', 's', 'r', 'u', 't', 'y')
10
+ FASTA_VOCAB = ('A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'O',
11
+ 'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z')
12
+
13
+ # Check uniqueness, create character-index dicts, and add '?' for unknown characters as index 0
14
+ assert len(SMILES_VOCAB) == len(set(SMILES_VOCAB)), 'SMILES_CHARSET has duplicate characters.'
15
+ SMILES_CHARSET_IDX = {character: index+1 for index, character in enumerate(SMILES_VOCAB)} | {'?': 0}
16
+
17
+ assert len(FASTA_VOCAB) == len(set(FASTA_VOCAB)), 'FASTA_CHARSET has duplicate characters.'
18
+ FASTA_CHARSET_IDX = {character: index+1 for index, character in enumerate(FASTA_VOCAB)} | {'?': 0}
19
+
20
+
21
def sequence_to_onehot(sequence: str, charset, max_sequence_length: int):
    """One-hot encode `sequence` over `charset`.

    Returns an int array of shape (len(charset) + 1, max_sequence_length);
    channel 0 encodes unknown characters, and the sequence is truncated to
    `max_sequence_length`.
    """
    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
    vocab = {'?': 0}
    for position, character in enumerate(charset):
        vocab[character] = position + 1

    encoded = np.zeros((max_sequence_length, len(vocab)), dtype=int)
    for position, character in enumerate(sequence[:max_sequence_length]):
        encoded[position, vocab.get(character, 0)] = 1

    return encoded.transpose()
30
+
31
+
32
def sequence_to_label(sequence: str, charset, max_sequence_length: int):
    """Integer-encode `sequence` over `charset`.

    Returns an int array of length `max_sequence_length` where 0 marks
    unknown characters and padding; the sequence is truncated to fit.
    """
    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
    vocab = {character: position + 1 for position, character in enumerate(charset)}

    codes = [vocab.get(character, 0) for character in sequence[:max_sequence_length]]
    codes.extend([0] * (max_sequence_length - len(codes)))
    return np.array(codes, dtype=int)
41
+
42
+
43
def smiles_to_onehot(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):
    """One-hot encode a SMILES string over `smiles_charset` (unknown chars map to channel 0)."""
    return sequence_to_onehot(smiles, smiles_charset, max_sequence_length)
50
+
51
+
52
def smiles_to_label(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):
    """Integer-encode a SMILES string over `smiles_charset` (unknown chars map to 0)."""
    return sequence_to_label(smiles, smiles_charset, max_sequence_length)
58
+
59
+
60
def fasta_to_onehot(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):
    """One-hot encode a FASTA (protein) sequence over `fasta_charset` (unknown chars map to channel 0)."""
    return sequence_to_onehot(fasta, fasta_charset, max_sequence_length)
66
+
67
+
68
def fasta_to_label(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):
    """Integer-encode a FASTA (protein) sequence over `fasta_charset` (unknown chars map to 0)."""
    return sequence_to_label(fasta, fasta_charset, max_sequence_length)
74
+
75
+
76
def one_of_k_encoding(x, allowable_set):
    """Return a boolean one-hot list for `x` over `allowable_set`.

    :raises ValueError: if `x` is not a member of `allowable_set`.
        (ValueError subclasses Exception, so pre-existing broad handlers
        still catch it; the old code raised a bare Exception with a
        garbled message.)
    """
    if x not in allowable_set:
        raise ValueError(f"input {x} not in allowable set {allowable_set}")
    return [x == s for s in allowable_set]
80
+
81
+
82
def one_of_k_encoding_unk(x, allowable_set):
    """Boolean one-hot over `allowable_set`; unknown inputs map to the last element."""
    probe = x if x in allowable_set else allowable_set[-1]
    return [probe == s for s in allowable_set]
deepscreen/data/featurizers/chem.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mainly adapted from MolMap:
3
+ https://github.com/shenwanxiang/bidd-molmap/tree/master/molmap/feature/fingerprint
4
+ """
5
+ import numpy as np
6
+ from rdkit import Chem, DataStructs
7
+ from rdkit.Chem import AllChem
8
+ from rdkit.Chem.Fingerprints import FingerprintMols
9
+ from rdkit.Chem.rdReducedGraphs import GetErGFingerprint
10
+
11
+ from deepscreen import get_logger
12
+
13
+ log = get_logger(__name__)
14
+
15
+
16
def smiles_to_erg(smiles):
    """Compute a boolean ErG (extended reduced graph) fingerprint for a SMILES string.

    Returns None (after logging a warning) if RDKit fails to parse or
    fingerprint the molecule.
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        return np.array(GetErGFingerprint(molecule), dtype=bool)
    except Exception as e:
        log.warning(f"Failed to convert SMILES ({smiles}) to ErGFP due to {str(e)}")
        return None
24
+
25
+
26
def smiles_to_morgan(smiles, radius=2, n_bits=1024):
    """Compute a Morgan (ECFP-like) fingerprint for a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius (2 corresponds to ECFP4).
        n_bits: length of the output bit vector.

    Returns:
        A numpy array of length `n_bits`, or None (after logging a warning)
        if RDKit parsing or fingerprinting fails.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        features = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(features_vec, features)
        # BUG FIX: the original had no return here, so the function always
        # fell through and implicitly returned None even on success.
        return features
    except Exception as e:
        # BUG FIX: the original warning said "ErGFP" (copy-paste from smiles_to_erg).
        log.warning(f"Failed to convert SMILES ({smiles}) to MorganFP due to {str(e)}")
        return None
35
+
36
+
37
def smiles_to_daylight(smiles, num_fingers=2048):
    """Compute a Daylight-like topological fingerprint for a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        num_fingers: length of the output bit array (2048 was previously
            hard-coded in two places; now a single parameter).

    Returns:
        An int array of length `num_fingers`; all zeros if RDKit fails.
    """
    features = np.zeros((num_fingers,))
    try:
        mol = Chem.MolFromSmiles(smiles)
        bv = FingerprintMols.FingerprintMol(mol)
        on_bits = tuple(bv.GetOnBits())
        # Guard the empty case: indexing with an empty (float-dtype) array raises.
        if on_bits:
            features[np.array(on_bits)] = 1
    except Exception:  # was a bare except with a print(); use the module logger
        log.warning(f'RDKit could not find this SMILES: {smiles} convert to all 0 features')
        features = np.zeros((num_fingers,))
    return features.astype(int)
deepscreen/data/featurizers/fcs.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from importlib import resources
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from subword_nmt.apply_bpe import BPE
6
+ import codecs
7
+
8
# --- Protein subword vocabulary (ESPF, UniProt-derived BPE codes) ---
vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/protein_codes_uniprot.txt')
# NOTE(review): the file handle is never closed; presumably BPE consumes it
# eagerly at construction time — confirm against subword_nmt.
bpe_codes_protein = codecs.open(vocab_path)
protein_bpe = BPE(bpe_codes_protein, merges=-1, separator='')

# Subword -> integer index mapping for proteins.
sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_uniprot.csv')
sub_csv = pd.read_csv(sub_csv_path)
idx2word_protein = sub_csv['index'].values
words2idx_protein = dict(zip(idx2word_protein, range(0, len(idx2word_protein))))

# --- Drug subword vocabulary (ESPF, ChEMBL-derived BPE codes) ---
vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/drug_codes_chembl.txt')
bpe_codes_drug = codecs.open(vocab_path)
drug_bpe = BPE(bpe_codes_drug, merges=-1, separator='')

# Subword -> integer index mapping for drugs.
sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_chembl.csv')
sub_csv = pd.read_csv(sub_csv_path)
idx2word_drug = sub_csv['index'].values
words2idx_drug = dict(zip(idx2word_drug, range(0, len(idx2word_drug))))
25
+
26
+
27
def protein_to_embedding(x, max_sequence_length):
    """Encode a protein sequence as fixed-length BPE subword indices.

    Args:
        x: protein sequence string.
        max_sequence_length: output length; shorter encodings are right-padded
            with zeros, longer ones truncated.

    Returns:
        Tuple (indices, mask) of numpy arrays of length `max_sequence_length`;
        mask is 1 over real tokens and 0 over padding.
    """
    tokens = protein_bpe.process_line(x).split()
    try:
        indices = np.asarray([words2idx_protein[token] for token in tokens])
    except KeyError:
        # Was a bare except; an out-of-vocabulary subword (KeyError) is the
        # only expected failure. Fall back to a single padding token,
        # matching the original behavior.
        indices = np.array([0])

    length = len(indices)
    if length < max_sequence_length:
        padded = np.pad(indices, (0, max_sequence_length - length), 'constant', constant_values=0)
        mask = [1] * length + [0] * (max_sequence_length - length)
    else:
        padded = indices[:max_sequence_length]
        mask = [1] * max_sequence_length

    return padded, np.asarray(mask)
46
+
47
+
48
def drug_to_embedding(x, max_sequence_length):
    """Encode a drug SMILES string as fixed-length BPE subword indices.

    Args:
        x: SMILES string of the drug.
        max_sequence_length: output length; shorter encodings are right-padded
            with zeros, longer ones truncated.

    Returns:
        Tuple (indices, mask) of numpy arrays of length `max_sequence_length`;
        mask is 1 over real tokens and 0 over padding.
    """
    tokens = drug_bpe.process_line(x).split()
    try:
        indices = np.asarray([words2idx_drug[token] for token in tokens])
    except KeyError:
        # Was a bare except; an out-of-vocabulary subword (KeyError) is the
        # only expected failure. Fall back to a single padding token,
        # matching the original behavior.
        indices = np.array([0])

    length = len(indices)
    if length < max_sequence_length:
        padded = np.pad(indices, (0, max_sequence_length - length), 'constant', constant_values=0)
        mask = [1] * length + [0] * (max_sequence_length - length)
    else:
        padded = indices[:max_sequence_length]
        mask = [1] * max_sequence_length

    return padded, np.asarray(mask)
deepscreen/data/featurizers/fingerprint/__init__.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ from .atompairs import GetAtomPairFPs
4
+ from .avalonfp import GetAvalonFPs
5
+ from .rdkitfp import GetRDkitFPs
6
+ from .morganfp import GetMorganFPs
7
+ from .estatefp import GetEstateFPs
8
+ from .maccskeys import GetMACCSFPs
9
+ from .pharmErGfp import GetPharmacoErGFPs
10
+ from .pharmPointfp import GetPharmacoPFPs
11
+ from .pubchemfp import GetPubChemFPs
12
+ from .torsions import GetTorsionFPs
13
+ from .mhfp6 import GetMHFP6
14
+ # from .map4 import GetMAP4
15
+ from rdkit import Chem
16
+
17
+ from deepscreen import get_logger
18
+
19
+ log = get_logger(__name__)
20
+
21
# Registry mapping user-facing fingerprint names to their generator functions.
# The keys are the accepted values for `smiles_to_fingerprint`'s `fingerprint`
# argument.
FP_MAP = {
    'MorganFP': GetMorganFPs,
    'RDkitFP': GetRDkitFPs,
    'AtomPairFP': GetAtomPairFPs,
    'TorsionFP': GetTorsionFPs,
    'AvalonFP': GetAvalonFPs,
    'EstateFP': GetEstateFPs,
    'MACCSFP': GetMACCSFPs,
    'PharmacoErGFP': GetPharmacoErGFPs,
    'PharmacoPFP': GetPharmacoPFPs,
    'PubChemFP': GetPubChemFPs,
    'MHFP6': GetMHFP6,
    # 'MAP4': GetMAP4,
}
35
+
36
+
37
def smiles_to_fingerprint(smiles, fingerprint: Literal[tuple(FP_MAP.keys())], **kwargs):
    """Convert a SMILES string into the named molecular fingerprint.

    Args:
        smiles: SMILES string of the molecule.
        fingerprint: fingerprint name; must be a key of `FP_MAP`.
        **kwargs: forwarded to the underlying fingerprint generator.

    Returns:
        The generator's array output on success, or None (after logging a
        warning) if RDKit parsing or fingerprint generation fails.
    """
    func = FP_MAP[fingerprint]
    try:
        mol = Chem.MolFromSmiles(smiles)
        arr = func(mol, **kwargs)
        return arr
    except Exception as e:
        log.warning(f"Failed to convert SMILES ({smiles}) to {fingerprint} due to {str(e)}")
        return None
deepscreen/data/featurizers/fingerprint/atompairs.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rdkit.Chem.AtomPairs import Pairs
2
+ from rdkit.Chem import DataStructs
3
+ import numpy as np
4
+
5
+ _type = 'topological-based'
6
+
7
+
8
def GetAtomPairFPs(mol, nBits=2048, binary=True):
    '''
    atompairs fingerprints
    '''
    fp = Pairs.GetHashedAtomPairFingerprint(mol, nBits=nBits)
    out_dtype = np.bool_ if binary else np.int8
    out = np.zeros((0,), dtype=out_dtype)
    # ConvertToNumpyArray resizes and fills `out` in place from the RDKit vector.
    DataStructs.ConvertToNumpyArray(fp, out)
    return out
deepscreen/data/featurizers/fingerprint/avalonfp.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rdkit.Chem import DataStructs
2
+ from rdkit.Avalon.pyAvalonTools import GetAvalonFP as GAFP
3
+ import numpy as np
4
+
5
+ _type = 'topological-based'
6
+
7
+
8
def GetAvalonFPs(mol, nBits=2048):
    '''
    Avalon_fingerprints: https://pubs.acs.org/doi/pdf/10.1021/ci050413p
    '''
    avalon_fp = GAFP(mol, nBits=nBits)
    out = np.zeros((0,), dtype=np.bool_)
    # ConvertToNumpyArray resizes and fills `out` in place from the RDKit vector.
    DataStructs.ConvertToNumpyArray(avalon_fp, out)
    return out
deepscreen/data/featurizers/fingerprint/estatefp.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rdkit.Chem.EState import Fingerprinter
2
+ import numpy as np
3
+
4
+ _type = 'Estate-based'
5
+
6
+
7
def GetEstateFPs(mol):
    '''
    79 bits Estate fps
    '''
    # FingerprintMol returns a pair; only the first (count) vector is used here.
    counts = Fingerprinter.FingerprintMol(mol)[0]
    return counts.astype(np.bool_)
deepscreen/data/featurizers/fingerprint/maccskeys.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rdkit.Chem import AllChem
2
+ from rdkit.Chem import DataStructs
3
+ import numpy as np
4
+ import pandas as pd
5
+ import os
6
+
7
+ _type = 'SMARTS-based'
8
+
9
+ file_path = os.path.dirname(__file__)
10
+
11
+
12
def GetMACCSFPs(mol):
    '''
    166 bits
    '''
    maccs_fp = AllChem.GetMACCSKeysFingerprint(mol)
    out = np.zeros((0,), dtype=np.bool_)
    # ConvertToNumpyArray resizes and fills `out` in place from the RDKit vector.
    DataStructs.ConvertToNumpyArray(maccs_fp, out)
    return out
22
+
23
+
24
def GetMACCSFPInfos():
    """Load the MACCS key descriptions from the spreadsheet shipped next to this module."""
    xlsx_path = os.path.join(file_path, 'maccskeys.xlsx')
    return pd.read_excel(xlsx_path)
deepscreen/data/featurizers/fingerprint/maccskeys.xlsx ADDED
Binary file (14 kB). View file
 
deepscreen/data/featurizers/fingerprint/map4.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MinHashed Atom-pair Fingerprint, MAP
3
+ orignal paper: Capecchi, Alice, Daniel Probst, and Jean-Louis Reymond. "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome." Journal of Cheminformatics 12.1 (2020): 1-15. orignal code: https://github.com/reymond-group/map4, thanks their orignal work
4
+
5
+ A small bug is fixed: https://github.com/reymond-group/map4/issues/6
6
+ """
7
+
8
+ _type = 'topological-based'
9
+
10
+ import itertools
11
+ from collections import defaultdict
12
+
13
+ import tmap as tm
14
+ from mhfp.encoder import MHFPEncoder
15
+ from rdkit import Chem
16
+ from rdkit.Chem import rdmolops
17
+ from rdkit.Chem.rdmolops import GetDistanceMatrix
18
+
19
+
20
def to_smiles(mol):
    """Return the canonical, non-isomeric SMILES for `mol`."""
    return Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)
22
+
23
+
24
class MAP4Calculator:
    def __init__(self, dimensions=2048, radius=2, is_counted=False, is_folded=False, fold_dimensions=2048):
        """
        MAP4 calculator class
        """
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded
        self.fold_dimensions = fold_dimensions

        # Folded output hashes/folds via MHFP; otherwise tmap's MinHash is used.
        if self.is_folded:
            self.encoder = MHFPEncoder(dimensions)
        else:
            self.encoder = tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint
        Arguments:
            mol -- rdkit mol object
        Returns:
            tmap VectorUint -- minhashed fingerprint
        """

        atom_env_pairs = self._calculate(mol)
        if self.is_folded:
            return self._fold(atom_env_pairs)
        return self.encoder.from_string_array(atom_env_pairs)

    def calculate_many(self, mols):
        """ Calculates the atom pair minhashed fingerprint
        Arguments:
            mols -- list of mols
        Returns:
            list of tmap VectorUint -- minhashed fingerprints list
        """

        atom_env_pairs_list = [self._calculate(mol) for mol in mols]
        if self.is_folded:
            return [self._fold(pairs) for pairs in atom_env_pairs_list]
        return self.encoder.batch_from_string_array(atom_env_pairs_list)

    def _calculate(self, mol):
        # Build the "envA|distance|envB" shingles for every atom pair.
        return self._all_pairs(mol, self._get_atom_envs(mol))

    def _fold(self, pairs):
        # Hash the unique shingles, then fold down to `fold_dimensions` bits.
        fp_hash = self.encoder.hash(set(pairs))
        return self.encoder.fold(fp_hash, self.fold_dimensions)

    def _get_atom_envs(self, mol):
        # For each atom index, collect the SMILES of its circular environment
        # at every radius 1..self.radius (list ordered by increasing radius).
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            for radius in range(1, self.radius + 1):
                if idx not in atoms_env:
                    atoms_env[idx] = []
                atoms_env[idx].append(MAP4Calculator._find_env(mol, idx, radius))
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the canonical SMILES of atom `idx`'s radius-N environment ('' if the atom is absent from the extracted submol)."""
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}

        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx in atom_map:
            smiles = Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx], canonical=True, isomericSmiles=False)
            return smiles
        return ''

    def _all_pairs(self, mol, atoms_env):
        """Return the deduplicated list of encoded atom-pair shingles for `mol`."""
        atom_pairs = []
        distance_matrix = GetDistanceMatrix(mol)
        num_atoms = mol.GetNumAtoms()
        shingle_dict = defaultdict(int)
        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
            # Topological (bond-count) distance between the two atoms.
            dist = str(int(distance_matrix[idx1][idx2]))

            for i in range(self.radius):
                env_a = atoms_env[idx1][i]
                env_b = atoms_env[idx2][i]

                # Sort the two environments so the shingle is order-independent.
                ordered = sorted([env_a, env_b])

                shingle = '{}|{}|{}'.format(ordered[0], dist, ordered[1])

                if self.is_counted:
                    # Counted variant: append an occurrence counter so repeated
                    # shingles remain distinct after the final set().
                    shingle_dict[shingle] += 1
                    shingle += '|' + str(shingle_dict[shingle])

                atom_pairs.append(shingle.encode('utf-8'))
        return list(set(atom_pairs))
116
+
117
+
118
def GetMAP4(mol, nBits=2048, radius=2, fold_dimensions=None):
    """
    MAP4: radius=2

    MinHashed Atom-Pair fingerprint, folded to `fold_dimensions` bits
    (defaults to `nBits` when not given). Returns a boolean numpy array.
    """
    if fold_dimensions is None:  # was `== None`; `is None` is the correct identity check
        fold_dimensions = nBits

    calc = MAP4Calculator(dimensions=nBits, radius=radius, is_counted=False, is_folded=True,
                          fold_dimensions=fold_dimensions)

    arr = calc.calculate(mol)

    return arr.astype(bool)
deepscreen/data/featurizers/fingerprint/mhfp6.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Probst, Daniel, and Jean-Louis Reymond. "A probabilistic molecular fingerprint for big data settings." Journal of cheminformatics 10.1 (2018): 66.'
3
+
4
+ orignal code: https://github.com/reymond-group/mhfp
5
+
6
+ """
7
+
8
+ from mhfp.encoder import MHFPEncoder
9
+
10
+
11
def GetMHFP6(mol, nBits=2048, radius=3):
    """
    MHFP6: radius=3
    """
    mhfp_encoder = MHFPEncoder(n_permutations=nBits)
    hash_values = mhfp_encoder.encode_mol(mol, radius=radius, rings=True, kekulize=True, min_radius=1)
    folded = mhfp_encoder.fold(hash_values, nBits)
    return folded.astype(bool)
deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AtomType NDonor [N&!H0&v3,N&!H0&+1&v4,n&H1&+0]
2
+ AtomType ChalcDonor [O,S;H1;+0]
3
+ DefineFeature SingleAtomDonor [{NDonor},{ChalcDonor},!$([D1]-[C;D3]=[O,S,N])]
4
+ Family Donor
5
+ Weights 1
6
+ EndFeature
7
+
8
+ AtomType NAcceptor [$([N&v3;H1,H2]-[!$(*=[O,N,P,S])])]
9
+ Atomtype NAcceptor [$([N;v3;H0])]
10
+ AtomType NAcceptor [$([n;+0])]
11
+ AtomType ChalcAcceptor [$([O,S;H1;v2]-[!$(*=[O,N,P,S])])]
12
+ AtomType ChalcAcceptor [O,S;H0;v2]
13
+ Atomtype ChalcAcceptor [O,S;-]
14
+ Atomtype ChalcAcceptor [o,s;+0]
15
+ AtomType HalogenAcceptor [F]
16
+ DefineFeature SingleAtomAcceptor [{NAcceptor},{ChalcAcceptor},{HalogenAcceptor}]
17
+ Family Acceptor
18
+ Weights 1
19
+ EndFeature
20
+
21
+ # this one is delightfully easy:
22
+ DefineFeature AcidicGroup [C,S](=[O,S,P])-[O;H1,H0&-1]
23
+ Family NegIonizable
24
+ Weights 1.0,1.0,1.0
25
+ EndFeature
26
+
27
+ AtomType CarbonOrArom_NonCarbonyl [$([C,a]);!$([C,a](=O))]
28
+ AtomType BasicNH2 [$([N;H2&+0][{CarbonOrArom_NonCarbonyl}])]
29
+ AtomType BasicNH1 [$([N;H1&+0]([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
30
+ AtomType BasicNH0 [$([N;H0&+0]([{CarbonOrArom_NonCarbonyl}])([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
31
+ AtomType BasicNakedN [N,n;X2;+0]
32
+ DefineFeature BasicGroup [{BasicNH2},{BasicNH1},{BasicNH0},{BasicNakedN}]
33
+ Family PosIonizable
34
+ Weights 1.0
35
+ EndFeature
36
+
37
+ # aromatic rings of various sizes:
38
+ DefineFeature Arom5 a1aaaa1
39
+ Family Aromatic
40
+ Weights 1.0,1.0,1.0,1.0,1.0
41
+ EndFeature
42
+ DefineFeature Arom6 a1aaaaa1
43
+ Family Aromatic
44
+ Weights 1.0,1.0,1.0,1.0,1.0,1.0
45
+ EndFeature
46
+ DefineFeature Arom7 a1aaaaaa1
47
+ Family Aromatic
48
+ Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0
49
+ EndFeature
50
+ DefineFeature Arom8 a1aaaaaaa1
51
+ Family Aromatic
52
+ Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
53
+ EndFeature
deepscreen/data/featurizers/fingerprint/morganfp.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rdkit.Chem import AllChem
2
+ from rdkit.Chem import DataStructs
3
+ import numpy as np
4
+
5
+
6
def GetMorganFPs(mol, nBits=2048, radius=2, return_bitInfo=False):
    """
    ECFP4: radius=2
    """
    bit_details = {}
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius,
                                               bitInfo=bit_details, nBits=nBits)
    out = np.zeros((0,), dtype=np.bool_)
    # ConvertToNumpyArray resizes and fills `out` in place from the RDKit vector.
    DataStructs.ConvertToNumpyArray(fp, out)

    if return_bitInfo:
        return out, bit_details
    return out
deepscreen/data/featurizers/fingerprint/pharmErGfp.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Sat Aug 17 16:54:12 2019
5
+
6
+ @author: wanxiang.shen@u.nus.edu
7
+
8
+ @calculate ErG fps, more info: https://pubs.acs.org/doi/full/10.1021/ci050457y#
9
+ """
10
+
11
+ _type = 'Pharmacophore-based'
12
+
13
+ import numpy as np
14
+ from rdkit.Chem import AllChem
15
+
16
## get info from : https://github.com/rdkit/rdkit/blob/d41752d558bf7200ab67b98cdd9e37f1bdd378de/Code/GraphMol/ReducedGraphs/ReducedGraphs.cpp
# SMARTS definitions for the six ErG pharmacophore property types.
Donor = ["[N;!H0;v3,v4&+1]", "[O,S;H1;+0]", "[n&H1&+0]"]

Acceptor = ["[O,S;H1;v2;!$(*-*=[O,N,P,S])]", "[O;H0;v2]", "[O,S;v1;-]",
            "[N;v3;!$(N-*=[O,N,P,S])]", "[n&H0&+0]", "[o;+0;!$([o]:n);!$([o]:c:n)]"]

Positive = ["[#7;+]", "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
            "[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
            "[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]"]

Negative = ["[C,S](=[O,S,P])-[O;H1,-1]"]

Hydrophobic = ["[C;D3,D4](-[CH3])-[CH3]", "[S;D2](-C)-C"]

Aromatic = ["a"]

# Order matters: GetPharmacoErGFPs enumerates (i, j, path) triplets over this
# list to label the fingerprint bits.
PROPERTY_KEY = ["Donor", "Acceptor", "Positive", "Negative", "Hydrophobic", "Aromatic"]
33
+
34
+
35
def GetPharmacoErGFPs(mol, fuzzIncrement=0.3, maxPath=21, binary=True, return_bitInfo=False):
    '''
    https://pubs.acs.org/doi/full/10.1021/ci050457y#
    return maxPath*21 bits

    size(v) = (n(n + 1)/2) * (maxDist - minDist + 1)

    '''
    minPath = 1

    values = AllChem.GetErGFingerprint(mol, fuzzIncrement=fuzzIncrement, maxPath=maxPath, minPath=minPath)
    values = values.astype(np.float32)

    if binary:
        values = values.astype(np.bool_)

    if return_bitInfo:
        # Enumerate every (property_i, property_j, path-length) triplet in the
        # same order as the fingerprint bit layout.
        triplets = [
            (PROPERTY_KEY[i], PROPERTY_KEY[j], path)
            for i in range(len(PROPERTY_KEY))
            for j in range(i, len(PROPERTY_KEY))
            for path in range(minPath, maxPath + 1)
        ]
        return values, triplets

    return values
deepscreen/data/featurizers/fingerprint/pharmPointfp.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Sat Aug 17 16:54:12 2019
5
+
6
+ @author: wanxiang.shen@u.nus.edu
7
+
8
+ Combining a set of chemical features with the 2D (topological) distances between them gives a 2D pharmacophore. When the distances are binned, unique integer ids can be assigned to each of these pharmacophores and they can be stored in a fingerprint. Details of the encoding are in: https://www.rdkit.org/docs/RDKit_Book.html#ph4-figure
9
+ """
10
+
11
+ _type = 'Pharmacophore-based'
12
+
13
+ from rdkit.Chem.Pharm2D.SigFactory import SigFactory
14
+ from rdkit.Chem.Pharm2D import Generate
15
+ from rdkit.Chem import DataStructs
16
+ from rdkit.Chem import ChemicalFeatures
17
+
18
+ import numpy as np
19
+ import os
20
+
21
+ fdef = os.path.join(os.path.dirname(__file__), 'mnimalfatures.fdef')
22
+ featFactory = ChemicalFeatures.BuildFeatureFactory(fdef)
23
+
24
+
25
def GetPharmacoPFPs(mol,
                    bins=None,
                    minPointCount=2,
                    maxPointCount=2,
                    return_bitInfo=False):
    '''
    2D pharmacophore fingerprint over the minimal feature factory.

    Note: maxPointCont with 3 is slowly

    bins = [(i,i+1) for i in range(20)],
    maxPonitCount=2 for large-scale computation

    Args:
        mol: RDKit mol object.
        bins: distance bins as (lo, hi) pairs; defaults to
            [(i, i + 1) for i in range(20)].  (Was a mutable default
            argument; the default value is unchanged.)
        minPointCount / maxPointCount: pharmacophore point counts.
        return_bitInfo: also return the per-bit textual descriptions.

    Returns:
        Boolean numpy array, or (array, descriptions) if return_bitInfo.
    '''
    if bins is None:
        bins = [(i, i + 1) for i in range(20)]

    MysigFactory = SigFactory(featFactory,
                              trianglePruneBins=False,
                              minPointCount=minPointCount,
                              maxPointCount=maxPointCount)
    MysigFactory.SetBins(bins)
    MysigFactory.Init()

    res = Generate.Gen2DFingerprint(mol, MysigFactory)
    arr = np.array(list(res)).astype(np.bool_)
    if return_bitInfo:
        description = []
        for i in range(len(res)):
            description.append(MysigFactory.GetBitDescription(i))
        return arr, description

    return arr
53
+
54
+
55
if __name__ == '__main__':
    from rdkit import Chem

    # Smoke test: compute the 2-point pharmacophore fingerprint of a sample molecule.
    mol = Chem.MolFromSmiles('CC#CC(=O)NC1=NC=C2C(=C1)C(=NC=N2)NC3=CC(=C(C=C3)F)Cl')
    a = GetPharmacoPFPs(mol, bins=[(i, i + 1) for i in range(20)], minPointCount=2, maxPointCount=2)
deepscreen/data/featurizers/fingerprint/pubchemfp.py ADDED
@@ -0,0 +1,1731 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Sun Aug 25 20:29:36 2019
5
+
6
+ @author: charleshen
7
+
8
+ @Note: The code are copyed from PyBioMed, with a minor repair
9
+
10
+ https://www.ncbi.nlm.nih.gov/pubmed/29556758
11
+
12
+ these are SMARTS patterns corresponding to the PubChem fingerprints
13
+ https://astro.temple.edu/~tua87106/list_fingerprints.pdf
14
+ ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
15
+
16
+ """
17
+
18
+ _type = 'SMARTS-based'
19
+
20
+ import numpy as np
21
+ from rdkit import Chem
22
+ from rdkit import DataStructs
23
+ import os
24
+ import pandas as pd
25
+
26
+ smartsPatts = {
27
+ 1: ('[H]', 3), # 1-115
28
+ 2: ('[H]', 7),
29
+ 3: ('[H]', 15),
30
+ 4: ('[H]', 31),
31
+ 5: ('[Li]', 0),
32
+ 6: ('[Li]', 1),
33
+ 7: ('[B]', 0),
34
+ 8: ('[B]', 1),
35
+ 9: ('[B]', 3),
36
+ 10: ('[C]', 1),
37
+ 11: ('[C]', 3),
38
+ 12: ('[C]', 7),
39
+ 13: ('[C]', 15),
40
+ 14: ('[C]', 31),
41
+ 15: ('[N]', 0),
42
+ 16: ('[N]', 1),
43
+ 17: ('[N]', 3),
44
+ 18: ('[N]', 7),
45
+ 19: ('[O]', 0),
46
+ 20: ('[O]', 1),
47
+ 21: ('[O]', 3),
48
+ 22: ('[O]', 7),
49
+ 23: ('[O]', 15),
50
+ 24: ('[F]', 0),
51
+ 25: ('[F]', 1),
52
+ 26: ('[F]', 3),
53
+ 27: ('[Na]', 0),
54
+ 28: ('[Na]', 1),
55
+ 29: ('[Si]', 0),
56
+ 30: ('[Si]', 1),
57
+ 31: ('[P]', 0),
58
+ 32: ('[P]', 1),
59
+ 33: ('[P]', 3),
60
+ 34: ('[S]', 0),
61
+ 35: ('[S]', 1),
62
+ 36: ('[S]', 3),
63
+ 37: ('[S]', 7),
64
+ 38: ('[Cl]', 0),
65
+ 39: ('[Cl]', 1),
66
+ 40: ('[Cl]', 3),
67
+ 41: ('[Cl]', 7),
68
+ 42: ('[K]', 0),
69
+ 43: ('[K]', 1),
70
+ 44: ('[Br]', 0),
71
+ 45: ('[Br]', 1),
72
+ 46: ('[Br]', 3),
73
+ 47: ('[I]', 0),
74
+ 48: ('[I]', 1),
75
+ 49: ('[I]', 3),
76
+ 50: ('[Be]', 0),
77
+ 51: ('[Mg]', 0),
78
+ 52: ('[Al]', 0),
79
+ 53: ('[Ca]', 0),
80
+ 54: ('[Sc]', 0),
81
+ 55: ('[Ti]', 0),
82
+ 56: ('[V]', 0),
83
+ 57: ('[Cr]', 0),
84
+ 58: ('[Mn]', 0),
85
+ 59: ('[Fe]', 0),
86
+ 60: ('[CO]', 0),
87
+ 61: ('[Ni]', 0),
88
+ 62: ('[Cu]', 0),
89
+ 63: ('[Zn]', 0),
90
+ 64: ('[Ga]', 0),
91
+ 65: ('[Ge]', 0),
92
+ 66: ('[As]', 0),
93
+ 67: ('[Se]', 0),
94
+ 68: ('[Kr]', 0),
95
+ 69: ('[Rb]', 0),
96
+ 70: ('[Sr]', 0),
97
+ 71: ('[Y]', 0),
98
+ 72: ('[Zr]', 0),
99
+ 73: ('[Nb]', 0),
100
+ 74: ('[Mo]', 0),
101
+ 75: ('[Ru]', 0),
102
+ 76: ('[Rh]', 0),
103
+ 77: ('[Pd]', 0),
104
+ 78: ('[Ag]', 0),
105
+ 79: ('[Cd]', 0),
106
+ 80: ('[In]', 0),
107
+ 81: ('[Sn]', 0),
108
+ 82: ('[Sb]', 0),
109
+ 83: ('[Te]', 0),
110
+ 84: ('[Xe]', 0),
111
+ 85: ('[Cs]', 0),
112
+ 86: ('[Ba]', 0),
113
+ 87: ('[Lu]', 0),
114
+ 88: ('[Hf]', 0),
115
+ 89: ('[Ta]', 0),
116
+ 90: ('[W]', 0),
117
+ 91: ('[Re]', 0),
118
+ 92: ('[Os]', 0),
119
+ 93: ('[Ir]', 0),
120
+ 94: ('[Pt]', 0),
121
+ 95: ('[Au]', 0),
122
+ 96: ('[Hg]', 0),
123
+ 97: ('[Tl]', 0),
124
+ 98: ('[Pb]', 0),
125
+ 99: ('[Bi]', 0),
126
+ 100: ('[La]', 0),
127
+ 101: ('[Ce]', 0),
128
+ 102: ('[Pr]', 0),
129
+ 103: ('[Nd]', 0),
130
+ 104: ('[Pm]', 0),
131
+ 105: ('[Sm]', 0),
132
+ 106: ('[Eu]', 0),
133
+ 107: ('[Gd]', 0),
134
+ 108: ('[Tb]', 0),
135
+ 109: ('[Dy]', 0),
136
+ 110: ('[Ho]', 0),
137
+ 111: ('[Er]', 0),
138
+ 112: ('[Tm]', 0),
139
+ 113: ('[Yb]', 0),
140
+ 114: ('[Tc]', 0),
141
+ 115: ('[U]', 0),
142
+ 116: ('[Li&!H0]', 0), # 264-881
143
+ 117: ('[Li]~[Li]', 0),
144
+ 118: ('[Li]~[#5]', 0),
145
+ 119: ('[Li]~[#6]', 0),
146
+ 120: ('[Li]~[#8]', 0),
147
+ 121: ('[Li]~[F]', 0),
148
+ 122: ('[Li]~[#15]', 0),
149
+ 123: ('[Li]~[#16]', 0),
150
+ 124: ('[Li]~[Cl]', 0),
151
+ 125: ('[#5&!H0]', 0),
152
+ 126: ('[#5]~[#5]', 0),
153
+ 127: ('[#5]~[#6]', 0),
154
+ 128: ('[#5]~[#7]', 0),
155
+ 129: ('[#5]~[#8]', 0),
156
+ 130: ('[#5]~[F]', 0),
157
+ 131: ('[#5]~[#14]', 0),
158
+ 132: ('[#5]~[#15]', 0),
159
+ 133: ('[#5]~[#16]', 0),
160
+ 134: ('[#5]~[Cl]', 0),
161
+ 135: ('[#5]~[Br]', 0),
162
+ 136: ('[#6&!H0]', 0),
163
+ 137: ('[#6]~[#6]', 0),
164
+ 138: ('[#6]~[#7]', 0),
165
+ 139: ('[#6]~[#8]', 0),
166
+ 140: ('[#6]~[F]', 0),
167
+ 141: ('[#6]~[Na]', 0),
168
+ 142: ('[#6]~[Mg]', 0),
169
+ 143: ('[#6]~[Al]', 0),
170
+ 144: ('[#6]~[#14]', 0),
171
+ 145: ('[#6]~[#15]', 0),
172
+ 146: ('[#6]~[#16]', 0),
173
+ 147: ('[#6]~[Cl]', 0),
174
+ 148: ('[#6]~[#33]', 0),
175
+ 149: ('[#6]~[#34]', 0),
176
+ 150: ('[#6]~[Br]', 0),
177
+ 151: ('[#6]~[I]', 0),
178
+ 152: ('[#7&!H0]', 0),
179
+ 153: ('[#7]~[#7]', 0),
180
+ 154: ('[#7]~[#8]', 0),
181
+ 155: ('[#7]~[F]', 0),
182
+ 156: ('[#7]~[#14]', 0),
183
+ 157: ('[#7]~[#15]', 0),
184
+ 158: ('[#7]~[#16]', 0),
185
+ 159: ('[#7]~[Cl]', 0),
186
+ 160: ('[#7]~[Br]', 0),
187
+ 161: ('[#8&!H0]', 0),
188
+ 162: ('[#8]~[#8]', 0),
189
+ 163: ('[#8]~[Mg]', 0),
190
+ 164: ('[#8]~[Na]', 0),
191
+ 165: ('[#8]~[Al]', 0),
192
+ 166: ('[#8]~[#14]', 0),
193
+ 167: ('[#8]~[#15]', 0),
194
+ 168: ('[#8]~[K]', 0),
195
+ 169: ('[F]~[#15]', 0),
196
+ 170: ('[F]~[#16]', 0),
197
+ 171: ('[Al&!H0]', 0),
198
+ 172: ('[Al]~[Cl]', 0),
199
+ 173: ('[#14&!H0]', 0),
200
+ 174: ('[#14]~[#14]', 0),
201
+ 175: ('[#14]~[Cl]', 0),
202
+ 176: ('[#15&!H0]', 0),
203
+ 177: ('[#15]~[#15]', 0),
204
+ 178: ('[#33&!H0]', 0),
205
+ 179: ('[#33]~[#33]', 0),
206
+ 180: ('[#6](~Br)(~[#6])', 0),
207
+ 181: ('[#6](~Br)(~[#6])(~[#6])', 0),
208
+ 182: ('[#6&!H0]~[Br]', 0),
209
+ 183: ('[#6](~[Br])(:[c])', 0),
210
+ 184: ('[#6](~[Br])(:[n])', 0),
211
+ 185: ('[#6](~[#6])(~[#6])', 0),
212
+ 186: ('[#6](~[#6])(~[#6])(~[#6])', 0),
213
+ 187: ('[#6](~[#6])(~[#6])(~[#6])(~[#6])', 0),
214
+ 188: ('[#6H1](~[#6])(~[#6])(~[#6])', 0),
215
+ 189: ('[#6](~[#6])(~[#6])(~[#6])(~[#7])', 0),
216
+ 190: ('[#6](~[#6])(~[#6])(~[#6])(~[#8])', 0),
217
+ 191: ('[#6H1](~[#6])(~[#6])(~[#7])', 0),
218
+ 192: ('[#6H1](~[#6])(~[#6])(~[#8])', 0),
219
+ 193: ('[#6](~[#6])(~[#6])(~[#7])', 0),
220
+ 194: ('[#6](~[#6])(~[#6])(~[#8])', 0),
221
+ 195: ('[#6](~[#6])(~[Cl])', 0),
222
+ 196: ('[#6&!H0](~[#6])(~[Cl])', 0),
223
+ 197: ('[#6H,#6H2,#6H3,#6H4]~[#6]', 0),
224
+ 198: ('[#6&!H0](~[#6])(~[#7])', 0),
225
+ 199: ('[#6&!H0](~[#6])(~[#8])', 0),
226
+ 200: ('[#6H1](~[#6])(~[#8])(~[#8])', 0),
227
+ 201: ('[#6&!H0](~[#6])(~[#15])', 0),
228
+ 202: ('[#6&!H0](~[#6])(~[#16])', 0),
229
+ 203: ('[#6](~[#6])(~[I])', 0),
230
+ 204: ('[#6](~[#6])(~[#7])', 0),
231
+ 205: ('[#6](~[#6])(~[#8])', 0),
232
+ 206: ('[#6](~[#6])(~[#16])', 0),
233
+ 207: ('[#6](~[#6])(~[#14])', 0),
234
+ 208: ('[#6](~[#6])(:c)', 0),
235
+ 209: ('[#6](~[#6])(:c)(:c)', 0),
236
+ 210: ('[#6](~[#6])(:c)(:n)', 0),
237
+ 211: ('[#6](~[#6])(:n)', 0),
238
+ 212: ('[#6](~[#6])(:n)(:n)', 0),
239
+ 213: ('[#6](~[Cl])(~[Cl])', 0),
240
+ 214: ('[#6&!H0](~[Cl])', 0),
241
+ 215: ('[#6](~[Cl])(:c)', 0),
242
+ 216: ('[#6](~[F])(~[F])', 0),
243
+ 217: ('[#6](~[F])(:c)', 0),
244
+ 218: ('[#6&!H0](~[#7])', 0),
245
+ 219: ('[#6&!H0](~[#8])', 0),
246
+ 220: ('[#6&!H0](~[#8])(~[#8])', 0),
247
+ 221: ('[#6&!H0](~[#16])', 0),
248
+ 222: ('[#6&!H0](~[#14])', 0),
249
+ 223: ('[#6&!H0]:c', 0),
250
+ 224: ('[#6&!H0](:c)(:c)', 0),
251
+ 225: ('[#6&!H0](:c)(:n)', 0),
252
+ 226: ('[#6&!H0](:n)', 0),
253
+ 227: ('[#6H3]', 0),
254
+ 228: ('[#6](~[#7])(~[#7])', 0),
255
+ 229: ('[#6](~[#7])(:c)', 0),
256
+ 230: ('[#6](~[#7])(:c)(:c)', 0),
257
+ 231: ('[#6](~[#7])(:c)(:n)', 0),
258
+ 232: ('[#6](~[#7])(:n)', 0),
259
+ 233: ('[#6](~[#8])(~[#8])', 0),
260
+ 234: ('[#6](~[#8])(:c)', 0),
261
+ 235: ('[#6](~[#8])(:c)(:c)', 0),
262
+ 236: ('[#6](~[#16])(:c)', 0),
263
+ 237: ('[#6](:c)(:c)', 0),
264
+ 238: ('[#6](:c)(:c)(:c)', 0),
265
+ 239: ('[#6](:c)(:c)(:n)', 0),
266
+ 240: ('[#6](:c)(:n)', 0),
267
+ 241: ('[#6](:c)(:n)(:n)', 0),
268
+ 242: ('[#6](:n)(:n)', 0),
269
+ 243: ('[#7](~[#6])(~[#6])', 0),
270
+ 244: ('[#7](~[#6])(~[#6])(~[#6])', 0),
271
+ 245: ('[#7&!H0](~[#6])(~[#6])', 0),
272
+ 246: ('[#7&!H0](~[#6])', 0),
273
+ 247: ('[#7&!H0](~[#6])(~[#7])', 0),
274
+ 248: ('[#7](~[#6])(~[#8])', 0),
275
+ 249: ('[#7](~[#6])(:c)', 0),
276
+ 250: ('[#7](~[#6])(:c)(:c)', 0),
277
+ 251: ('[#7&!H0](~[#7])', 0),
278
+ 252: ('[#7&!H0](:c)', 0),
279
+ 253: ('[#7&!H0](:c)(:c)', 0),
280
+ 254: ('[#7](~[#8])(~[#8])', 0),
281
+ 255: ('[#7](~[#8])(:o)', 0),
282
+ 256: ('[#7](:c)(:c)', 0),
283
+ 257: ('[#7](:c)(:c)(:c)', 0),
284
+ 258: ('[#8](~[#6])(~[#6])', 0),
285
+ 259: ('[#8&!H0](~[#6])', 0),
286
+ 260: ('[#8](~[#6])(~[#15])', 0),
287
+ 261: ('[#8&!H0](~[#16])', 0),
288
+ 262: ('[#8](:c)(:c)', 0),
289
+ 263: ('[#15](~[#6])(~[#6])', 0),
290
+ 264: ('[#15](~[#8])(~[#8])', 0),
291
+ 265: ('[#16](~[#6])(~[#6])', 0),
292
+ 266: ('[#16&!H0](~[#6])', 0),
293
+ 267: ('[#16](~[#6])(~[#8])', 0),
294
+ 268: ('[#14](~[#6])(~[#6])', 0),
295
+ 269: ('[#6]=,:[#6]', 0),
296
+ 270: ('[#6]#[#6]', 0),
297
+ 271: ('[#6]=,:[#7]', 0),
298
+ 272: ('[#6]#[#7]', 0),
299
+ 273: ('[#6]=,:[#8]', 0),
300
+ 274: ('[#6]=,:[#16]', 0),
301
+ 275: ('[#7]=,:[#7]', 0),
302
+ 276: ('[#7]=,:[#8]', 0),
303
+ 277: ('[#7]=,:[#15]', 0),
304
+ 278: ('[#15]=,:[#8]', 0),
305
+ 279: ('[#15]=,:[#15]', 0),
306
+ 280: ('[#6](#[#6])(-,:[#6])', 0),
307
+ 281: ('[#6&!H0](#[#6])', 0),
308
+ 282: ('[#6](#[#7])(-,:[#6])', 0),
309
+ 283: ('[#6](-,:[#6])(-,:[#6])(=,:[#6])', 0),
310
+ 284: ('[#6](-,:[#6])(-,:[#6])(=,:[#7])', 0),
311
+ 285: ('[#6](-,:[#6])(-,:[#6])(=,:[#8])', 0),
312
+ 286: ('[#6](-,:[#6])([Cl])(=,:[#8])', 0),
313
+ 287: ('[#6&!H0](-,:[#6])(=,:[#6])', 0),
314
+ 288: ('[#6&!H0](-,:[#6])(=,:[#7])', 0),
315
+ 289: ('[#6&!H0](-,:[#6])(=,:[#8])', 0),
316
+ 290: ('[#6](-,:[#6])(-,:[#7])(=,:[#6])', 0),
317
+ 291: ('[#6](-,:[#6])(-,:[#7])(=,:[#7])', 0),
318
+ 292: ('[#6](-,:[#6])(-,:[#7])(=,:[#8])', 0),
319
+ 293: ('[#6](-,:[#6])(-,:[#8])(=,:[#8])', 0),
320
+ 294: ('[#6](-,:[#6])(=,:[#6])', 0),
321
+ 295: ('[#6](-,:[#6])(=,:[#7])', 0),
322
+ 296: ('[#6](-,:[#6])(=,:[#8])', 0),
323
+ 297: ('[#6]([Cl])(=,:[#8])', 0),
324
+ 298: ('[#6&!H0](-,:[#7])(=,:[#6])', 0),
325
+ 299: ('[#6&!H0](=,:[#6])', 0),
326
+ 300: ('[#6&!H0](=,:[#7])', 0),
327
+ 301: ('[#6&!H0](=,:[#8])', 0),
328
+ 302: ('[#6](-,:[#7])(=,:[#6])', 0),
329
+ 303: ('[#6](-,:[#7])(=,:[#7])', 0),
330
+ 304: ('[#6](-,:[#7])(=,:[#8])', 0),
331
+ 305: ('[#6](-,:[#8])(=,:[#8])', 0),
332
+ 306: ('[#7](-,:[#6])(=,:[#6])', 0),
333
+ 307: ('[#7](-,:[#6])(=,:[#8])', 0),
334
+ 308: ('[#7](-,:[#8])(=,:[#8])', 0),
335
+ 309: ('[#15](-,:[#8])(=,:[#8])', 0),
336
+ 310: ('[#16](-,:[#6])(=,:[#8])', 0),
337
+ 311: ('[#16](-,:[#8])(=,:[#8])', 0),
338
+ 312: ('[#16](=,:[#8])(=,:[#8])', 0),
339
+ 313: ('[#6]-,:[#6]-,:[#6]#[#6]', 0),
340
+ 314: ('[#8]-,:[#6]-,:[#6]=,:[#7]', 0),
341
+ 315: ('[#8]-,:[#6]-,:[#6]=,:[#8]', 0),
342
+ 316: ('[#7]:[#6]-,:[#16&!H0]', 0),
343
+ 317: ('[#7]-,:[#6]-,:[#6]=,:[#6]', 0),
344
+ 318: ('[#8]=,:[#16]-,:[#6]-,:[#6]', 0),
345
+ 319: ('[#7]#[#6]-,:[#6]=,:[#6]', 0),
346
+ 320: ('[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
347
+ 321: ('[#8]=,:[#16]-,:[#6]-,:[#7]', 0),
348
+ 322: ('[#16]-,:[#16]-,:[#6]:[#6]', 0),
349
+ 323: ('[#6]:[#6]-,:[#6]=,:[#6]', 0),
350
+ 324: ('[#16]:[#6]:[#6]:[#6]', 0),
351
+ 325: ('[#6]:[#7]:[#6]-,:[#6]', 0),
352
+ 326: ('[#16]-,:[#6]:[#7]:[#6]', 0),
353
+ 327: ('[#16]:[#6]:[#6]:[#7]', 0),
354
+ 328: ('[#16]-,:[#6]=,:[#7]-,:[#6]', 0),
355
+ 329: ('[#6]-,:[#8]-,:[#6]=,:[#6]', 0),
356
+ 330: ('[#7]-,:[#7]-,:[#6]:[#6]', 0),
357
+ 331: ('[#16]-,:[#6]=,:[#7&!H0]', 0),
358
+ 332: ('[#16]-,:[#6]-,:[#16]-,:[#6]', 0),
359
+ 333: ('[#6]:[#16]:[#6]-,:[#6]', 0),
360
+ 334: ('[#8]-,:[#16]-,:[#6]:[#6]', 0),
361
+ 335: ('[#6]:[#7]-,:[#6]:[#6]', 0),
362
+ 336: ('[#7]-,:[#16]-,:[#6]:[#6]', 0),
363
+ 337: ('[#7]-,:[#6]:[#7]:[#6]', 0),
364
+ 338: ('[#7]:[#6]:[#6]:[#7]', 0),
365
+ 339: ('[#7]-,:[#6]:[#7]:[#7]', 0),
366
+ 340: ('[#7]-,:[#6]=,:[#7]-,:[#6]', 0),
367
+ 341: ('[#7]-,:[#6]=,:[#7&!H0]', 0),
368
+ 342: ('[#7]-,:[#6]-,:[#16]-,:[#6]', 0),
369
+ 343: ('[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
370
+ 344: ('[#6]-,:[#7]:[#6&!H0]', 0),
371
+ 345: ('[#7]-,:[#6]:[#8]:[#6]', 0),
372
+ 346: ('[#8]=,:[#6]-,:[#6]:[#6]', 0),
373
+ 347: ('[#8]=,:[#6]-,:[#6]:[#7]', 0),
374
+ 348: ('[#6]-,:[#7]-,:[#6]:[#6]', 0),
375
+ 349: ('[#7]:[#7]-,:[#6&!H0]', 0),
376
+ 350: ('[#8]-,:[#6]:[#6]:[#7]', 0),
377
+ 351: ('[#8]-,:[#6]=,:[#6]-,:[#6]', 0),
378
+ 352: ('[#7]-,:[#6]:[#6]:[#7]', 0),
379
+ 353: ('[#6]-,:[#16]-,:[#6]:[#6]', 0),
380
+ 354: ('[Cl]-,:[#6]:[#6]-,:[#6]', 0),
381
+ 355: ('[#7]-,:[#6]=,:[#6&!H0]', 0),
382
+ 356: ('[Cl]-,:[#6]:[#6&!H0]', 0),
383
+ 357: ('[#7]:[#6]:[#7]-,:[#6]', 0),
384
+ 358: ('[Cl]-,:[#6]:[#6]-,:[#8]', 0),
385
+ 359: ('[#6]-,:[#6]:[#7]:[#6]', 0),
386
+ 360: ('[#6]-,:[#6]-,:[#16]-,:[#6]', 0),
387
+ 361: ('[#16]=,:[#6]-,:[#7]-,:[#6]', 0),
388
+ 362: ('[Br]-,:[#6]:[#6]-,:[#6]', 0),
389
+ 363: ('[#7&!H0]-,:[#7&!H0]', 0),
390
+ 364: ('[#16]=,:[#6]-,:[#7&!H0]', 0),
391
+ 365: ('[#6]-,:[#33]-[#8&!H0]', 0),
392
+ 366: ('[#16]:[#6]:[#6&!H0]', 0),
393
+ 367: ('[#8]-,:[#7]-,:[#6]-,:[#6]', 0),
394
+ 368: ('[#7]-,:[#7]-,:[#6]-,:[#6]', 0),
395
+ 369: ('[#6H,#6H2,#6H3]=,:[#6H,#6H2,#6H3]', 0),
396
+ 370: ('[#7]-,:[#7]-,:[#6]-,:[#7]', 0),
397
+ 371: ('[#8]=,:[#6]-,:[#7]-,:[#7]', 0),
398
+ 372: ('[#7]=,:[#6]-,:[#7]-,:[#6]', 0),
399
+ 373: ('[#6]=,:[#6]-,:[#6]:[#6]', 0),
400
+ 374: ('[#6]:[#7]-,:[#6&!H0]', 0),
401
+ 375: ('[#6]-,:[#7]-,:[#7&!H0]', 0),
402
+ 376: ('[#7]:[#6]:[#6]-,:[#6]', 0),
403
+ 377: ('[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
404
+ 378: ('[#33]-,:[#6]:[#6&!H0]', 0),
405
+ 379: ('[Cl]-,:[#6]:[#6]-,:[Cl]', 0),
406
+ 380: ('[#6]:[#6]:[#7&!H0]', 0),
407
+ 381: ('[#7&!H0]-,:[#6&!H0]', 0),
408
+ 382: ('[Cl]-,:[#6]-,:[#6]-,:[Cl]', 0),
409
+ 383: ('[#7]:[#6]-,:[#6]:[#6]', 0),
410
+ 384: ('[#16]-,:[#6]:[#6]-,:[#6]', 0),
411
+ 385: ('[#16]-,:[#6]:[#6&!H0]', 0),
412
+ 386: ('[#16]-,:[#6]:[#6]-,:[#7]', 0),
413
+ 387: ('[#16]-,:[#6]:[#6]-,:[#8]', 0),
414
+ 388: ('[#8]=,:[#6]-,:[#6]-,:[#6]', 0),
415
+ 389: ('[#8]=,:[#6]-,:[#6]-,:[#7]', 0),
416
+ 390: ('[#8]=,:[#6]-,:[#6]-,:[#8]', 0),
417
+ 391: ('[#7]=,:[#6]-,:[#6]-,:[#6]', 0),
418
+ 392: ('[#7]=,:[#6]-,:[#6&!H0]', 0),
419
+ 393: ('[#6]-,:[#7]-,:[#6&!H0]', 0),
420
+ 394: ('[#8]-,:[#6]:[#6]-,:[#6]', 0),
421
+ 395: ('[#8]-,:[#6]:[#6&!H0]', 0),
422
+ 396: ('[#8]-,:[#6]:[#6]-,:[#7]', 0),
423
+ 397: ('[#8]-,:[#6]:[#6]-,:[#8]', 0),
424
+ 398: ('[#7]-,:[#6]:[#6]-,:[#6]', 0),
425
+ 399: ('[#7]-,:[#6]:[#6&!H0]', 0),
426
+ 400: ('[#7]-,:[#6]:[#6]-,:[#7]', 0),
427
+ 401: ('[#8]-,:[#6]-,:[#6]:[#6]', 0),
428
+ 402: ('[#7]-,:[#6]-,:[#6]:[#6]', 0),
429
+ 403: ('[Cl]-,:[#6]-,:[#6]-,:[#6]', 0),
430
+ 404: ('[Cl]-,:[#6]-,:[#6]-,:[#8]', 0),
431
+ 405: ('[#6]:[#6]-,:[#6]:[#6]', 0),
432
+ 406: ('[#8]=,:[#6]-,:[#6]=,:[#6]', 0),
433
+ 407: ('[Br]-,:[#6]-,:[#6]-,:[#6]', 0),
434
+ 408: ('[#7]=,:[#6]-,:[#6]=,:[#6]', 0),
435
+ 409: ('[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
436
+ 410: ('[#7]:[#6]-,:[#8&!H0]', 0),
437
+ 411: ('[#8]=,:[#7]-,:c:c', 0),
438
+ 412: ('[#8]-,:[#6]-,:[#7&!H0]', 0),
439
+ 413: ('[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
440
+ 414: ('[Cl]-,:[#6]-,:[#6]=,:[#8]', 0),
441
+ 415: ('[Br]-,:[#6]-,:[#6]=,:[#8]', 0),
442
+ 416: ('[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
443
+ 417: ('[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
444
+ 418: ('[#6]:[#6]-,:[#8]-,:[#6]', 0),
445
+ 419: ('[#8]-,:[#6]-,:[#6]-,:[#7]', 0),
446
+ 420: ('[#8]-,:[#6]-,:[#6]-,:[#8]', 0),
447
+ 421: ('N#[#6]-,:[#6]-,:[#6]', 0),
448
+ 422: ('[#7]-,:[#6]-,:[#6]-,:[#7]', 0),
449
+ 423: ('[#6]:[#6]-,:[#6]-,:[#6]', 0),
450
+ 424: ('[#6&!H0]-,:[#8&!H0]', 0),
451
+ 425: ('n:c:n:c', 0),
452
+ 426: ('[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
453
+ 427: ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
454
+ 428: ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
455
+ 429: ('[#7]=,:[#6]-,:[#6]:[#6&!H0]', 0),
456
+ 430: ('c:c-,:[#7]-,:c:c', 0),
457
+ 431: ('[#6]-,:[#6]:[#6]-,:c:c', 0),
458
+ 432: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
459
+ 433: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
460
+ 434: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
461
+ 435: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
462
+ 436: ('[Cl]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
463
+ 437: ('c:c-,:[#6]=,:[#6]-,:[#6]', 0),
464
+ 438: ('[#6]-,:[#6]:[#6]-,:[#7]-,:[#6]', 0),
465
+ 439: ('[#6]-,:[#16]-,:[#6]-,:[#6]-,:[#6]', 0),
466
+ 440: ('[#7]-,:[#6]:[#6]-,:[#8&!H0]', 0),
467
+ 441: ('[#8]=,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
468
+ 442: ('[#6]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
469
+ 443: ('[#6]-,:[#6]:[#6]-,:[#8&!H0]', 0),
470
+ 444: ('[Cl]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
471
+ 445: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
472
+ 446: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
473
+ 447: ('[#6]-,:[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
474
+ 448: ('c:c-,:[#6]-,:[#6]-,:[#6]', 0),
475
+ 449: ('[#7]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
476
+ 450: ('[#8]=,:[#6]-,:[#6]-,:c:c', 0),
477
+ 451: ('[Cl]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
478
+ 452: ('[#6H,#6H2,#6H3]-,:[#6]=,:[#6H,#6H2,#6H3]', 0),
479
+ 453: ('[#7]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
480
+ 454: ('[#7]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
481
+ 455: ('[#8]=,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
482
+ 456: ('[#6]-,:c:c:[#6]-,:[#6]', 0),
483
+ 457: ('[#6]-,:[#8]-,:[#6]-,:[#6]:c', 0),
484
+ 458: ('[#8]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
485
+ 459: ('[#8]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
486
+ 460: ('[#7]-,:[#6]-,:[#6]-,:[#6]:c', 0),
487
+ 461: ('[#6]-,:[#6]-,:[#6]-,:[#6]:c', 0),
488
+ 462: ('[Cl]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
489
+ 463: ('[#6]-,:[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
490
+ 464: ('[#7]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
491
+ 465: ('[#7]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
492
+ 466: ('[#6]-,:[#7]-,:[#6]-,:[#6]-,:[#6]', 0),
493
+ 467: ('[#6]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
494
+ 468: ('[#7]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
495
+ 469: ('c:c:n:n:c', 0),
496
+ 470: ('[#6]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
497
+ 471: ('c:[#6]-,:[#6]-,:[#6]:c', 0),
498
+ 472: ('[#8]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
499
+ 473: ('c:c-,:[#8]-,:[#6]-,:[#6]', 0),
500
+ 474: ('[#7]-,:[#6]:c:c:n', 0),
501
+ 475: ('[#8]=,:[#6]-,:[#8]-,:[#6]:c', 0),
502
+ 476: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
503
+ 477: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#7]', 0),
504
+ 478: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
505
+ 479: ('[#6]-,:[#8]-,:[#6]:[#6]-,:[#6]', 0),
506
+ 480: ('[#8]=,:[#33]-,:[#6]:c:c', 0),
507
+ 481: ('[#6]-,:[#7]-,:[#6]-,:[#6]:c', 0),
508
+ 482: ('[#16]-,:[#6]:c:c-,:[#7]', 0),
509
+ 483: ('[#8]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
510
+ 484: ('[#8]-,:[#6]:[#6]-,:[#8&!H0]', 0),
511
+ 485: ('[#6]-,:[#6]-,:[#8]-,:[#6]:c', 0),
512
+ 486: ('[#7]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
513
+ 487: ('[#6]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
514
+ 488: ('[#7]-,:[#7]-,:[#6]-,:[#7&!H0]', 0),
515
+ 489: ('[#6]-,:[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
516
+ 490: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
517
+ 491: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
518
+ 492: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
519
+ 493: ('[#6]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
520
+ 494: ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
521
+ 495: ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
522
+ 496: ('[#6&!H0]-,:[#6]-,:[#7&!H0]', 0),
523
+ 497: ('[#6]-,:[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
524
+ 498: ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
525
+ 499: ('[#8]=,:[#6]-,:[#7]-,:[#6&!H0]', 0),
526
+ 500: ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#7]', 0),
527
+ 501: ('[#8]=,:[#7]-,:[#6]:[#6]-,:[#7]', 0),
528
+ 502: ('[#8]=,:[#7]-,:c:c-,:[#8]', 0),
529
+ 503: ('[#8]=,:[#6]-,:[#7]-,:[#6]=,:[#8]', 0),
530
+ 504: ('[#8]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
531
+ 505: ('[#8]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
532
+ 506: ('[#8]-,:[#6]:[#6]:[#6]-,:[#8]', 0),
533
+ 507: ('[#7]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
534
+ 508: ('[#8]-,:[#6]-,:[#6]-,:[#6]:c', 0),
535
+ 509: ('[#6]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
536
+ 510: ('[#6]-,:[#7]-,:[#6]:[#6]-,:[#6]', 0),
537
+ 511: ('[#6]-,:[#6]-,:[#16]-,:[#6]-,:[#6]', 0),
538
+ 512: ('[#8]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
539
+ 513: ('[#6]-,:[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
540
+ 514: ('[#8]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
541
+ 515: ('[#8]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
542
+ 516: ('[#8]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
543
+ 517: ('[#6]-,:[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
544
+ 518: ('[#7]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
545
+ 519: ('[#6]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
546
+ 520: ('[#6]=,:[#6]-,:[#6]-,:[#8&!H0]', 0),
547
+ 521: ('[#6]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
548
+ 522: ('[Cl]-,:[#6]:[#6]-,:[#6]=,:[#8]', 0),
549
+ 523: ('[Br]-,:[#6]:c:c-,:[#6]', 0),
550
+ 524: ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
551
+ 525: ('[#8]=,:[#6]-,:[#6]=,:[#6&!H0]', 0),
552
+ 526: ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#7]', 0),
553
+ 527: ('[#7]-,:[#6]-,:[#7]-,:[#6]:c', 0),
554
+ 528: ('[Br]-,:[#6]-,:[#6]-,:[#6]:c', 0),
555
+ 529: ('[#7]#[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
556
+ 530: ('[#6]-,:[#6]=,:[#6]-,:[#6]:c', 0),
557
+ 531: ('[#6]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
558
+ 532: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
559
+ 533: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
560
+ 534: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
561
+ 535: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
562
+ 536: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
563
+ 537: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
564
+ 538: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
565
+ 539: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
566
+ 540: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
567
+ 541: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
568
+ 542: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
569
+ 543: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
570
+ 544: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
571
+ 545: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
572
+ 546: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
573
+ 547: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
574
+ 548: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
575
+ 549: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
576
+ 550: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
577
+ 551: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
578
+ 552: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
579
+ 553: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
580
+ 554: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
581
+ 555: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
582
+ 556: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
583
+ 557: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
584
+ 558: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
585
+ 559: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](=,:[#8])-,:[#6]', 0),
586
+ 560: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
587
+ 561: ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
588
+ 562: ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]-,:[#6]', 0),
589
+ 563: ('[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
590
+ 564: ('[#6]-,:[#6](-,:[#6])(-,:[#6])-,:[#6]-,:[#6]', 0),
591
+ 565: ('[#6]-,:[#6](-,:[#6])-,:[#6](-,:[#6])-,:[#6]', 0),
592
+ 566: ('[#6]c1ccc([#6])cc1', 0),
593
+ 567: ('[#6]c1ccc([#8])cc1', 0),
594
+ 568: ('[#6]c1ccc([#16])cc1', 0),
595
+ 569: ('[#6]c1ccc([#7])cc1', 0),
596
+ 570: ('[#6]c1ccc(Cl)cc1', 0),
597
+ 571: ('[#6]c1ccc(Br)cc1', 0),
598
+ 572: ('[#8]c1ccc([#8])cc1', 0),
599
+ 573: ('[#8]c1ccc([#16])cc1', 0),
600
+ 574: ('[#8]c1ccc([#7])cc1', 0),
601
+ 575: ('[#8]c1ccc(Cl)cc1', 0),
602
+ 576: ('[#8]c1ccc(Br)cc1', 0),
603
+ 577: ('[#16]c1ccc([#16])cc1', 0),
604
+ 578: ('[#16]c1ccc([#7])cc1', 0),
605
+ 579: ('[#16]c1ccc(Cl)cc1', 0),
606
+ 580: ('[#16]c1ccc(Br)cc1', 0),
607
+ 581: ('[#7]c1ccc([#7])cc1', 0),
608
+ 582: ('[#7]c1ccc(Cl)cc1', 0),
609
+ 583: ('[#7]c1ccc(Br)cc1', 0),
610
+ 584: ('Clc1ccc(Cl)cc1', 0),
611
+ 585: ('Clc1ccc(Br)cc1', 0),
612
+ 586: ('Brc1ccc(Br)cc1', 0),
613
+ 587: ('[#6]c1cc([#6])ccc1', 0),
614
+ 588: ('[#6]c1cc([#8])ccc1', 0),
615
+ 589: ('[#6]c1cc([#16])ccc1', 0),
616
+ 590: ('[#6]c1cc([#7])ccc1', 0),
617
+ 591: ('[#6]c1cc(Cl)ccc1', 0),
618
+ 592: ('[#6]c1cc(Br)ccc1', 0),
619
+ 593: ('[#8]c1cc([#8])ccc1', 0),
620
+ 594: ('[#8]c1cc([#16])ccc1', 0),
621
+ 595: ('[#8]c1cc([#7])ccc1', 0),
622
+ 596: ('[#8]c1cc(Cl)ccc1', 0),
623
+ 597: ('[#8]c1cc(Br)ccc1', 0),
624
+ 598: ('[#16]c1cc([#16])ccc1', 0),
625
+ 599: ('[#16]c1cc([#7])ccc1', 0),
626
+ 600: ('[#16]c1cc(Cl)ccc1', 0),
627
+ 601: ('[#16]c1cc(Br)ccc1', 0),
628
+ 602: ('[#7]c1cc([#7])ccc1', 0),
629
+ 603: ('[#7]c1cc(Cl)ccc1', 0),
630
+ 604: ('[#7]c1cc(Br)ccc1', 0),
631
+ 605: ('Clc1cc(Cl)ccc1', 0),
632
+ 606: ('Clc1cc(Br)ccc1', 0),
633
+ 607: ('Brc1cc(Br)ccc1', 0),
634
+ 608: ('[#6]c1c([#6])cccc1', 0),
635
+ 609: ('[#6]c1c([#8])cccc1', 0),
636
+ 610: ('[#6]c1c([#16])cccc1', 0),
637
+ 611: ('[#6]c1c([#7])cccc1', 0),
638
+ 612: ('[#6]c1c(Cl)cccc1', 0),
639
+ 613: ('[#6]c1c(Br)cccc1', 0),
640
+ 614: ('[#8]c1c([#8])cccc1', 0),
641
+ 615: ('[#8]c1c([#16])cccc1', 0),
642
+ 616: ('[#8]c1c([#7])cccc1', 0),
643
+ 617: ('[#8]c1c(Cl)cccc1', 0),
644
+ 618: ('[#8]c1c(Br)cccc1', 0),
645
+ 619: ('[#16]c1c([#16])cccc1', 0),
646
+ 620: ('[#16]c1c([#7])cccc1', 0),
647
+ 621: ('[#16]c1c(Cl)cccc1', 0),
648
+ 622: ('[#16]c1c(Br)cccc1', 0),
649
+ 623: ('[#7]c1c([#7])cccc1', 0),
650
+ 624: ('[#7]c1c(Cl)cccc1', 0),
651
+ 625: ('[#7]c1c(Br)cccc1', 0),
652
+ 626: ('Clc1c(Cl)cccc1', 0),
653
+ 627: ('Clc1c(Br)cccc1', 0),
654
+ 628: ('Brc1c(Br)cccc1', 0),
655
+ 629: ('[#6][#6]1[#6][#6][#6]([#6])[#6][#6]1', 0),
656
+ 630: ('[#6][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
657
+ 631: ('[#6][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
658
+ 632: ('[#6][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
659
+ 633: ('[#6][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
660
+ 634: ('[#6][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
661
+ 635: ('[#8][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
662
+ 636: ('[#8][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
663
+ 637: ('[#8][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
664
+ 638: ('[#8][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
665
+ 639: ('[#8][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
666
+ 640: ('[#16][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
667
+ 641: ('[#16][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
668
+ 642: ('[#16][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
669
+ 643: ('[#16][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
670
+ 644: ('[#7][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
671
+ 645: ('[#7][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
672
+ 646: ('[#7][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
673
+ 647: ('Cl[#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
674
+ 648: ('Cl[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
675
+ 649: ('Br[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
676
+ 650: ('[#6][#6]1[#6][#6]([#6])[#6][#6][#6]1', 0),
677
+ 651: ('[#6][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
678
+ 652: ('[#6][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
679
+ 653: ('[#6][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
680
+ 654: ('[#6][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
681
+ 655: ('[#6][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
682
+ 656: ('[#8][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
683
+ 657: ('[#8][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
684
+ 658: ('[#8][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
685
+ 659: ('[#8][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
686
+ 660: ('[#8][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
687
+ 661: ('[#16][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
688
+ 662: ('[#16][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
689
+ 663: ('[#16][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
690
+ 664: ('[#16][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
691
+ 665: ('[#7][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
692
+ 666: ('[#7][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
693
+ 667: ('[#7][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
694
+ 668: ('Cl[#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
695
+ 669: ('Cl[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
696
+ 670: ('Br[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
697
+ 671: ('[#6][#6]1[#6]([#6])[#6][#6][#6][#6]1', 0),
698
+ 672: ('[#6][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
699
+ 673: ('[#6][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
700
+ 674: ('[#6][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
701
+ 675: ('[#6][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
702
+ 676: ('[#6][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
703
+ 677: ('[#8][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
704
+ 678: ('[#8][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
705
+ 679: ('[#8][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
706
+ 680: ('[#8][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
707
+ 681: ('[#8][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
708
+ 682: ('[#16][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
709
+ 683: ('[#16][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
710
+ 684: ('[#16][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
711
+ 685: ('[#16][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
712
+ 686: ('[#7][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
713
+ 687: ('[#7][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
714
+ 688: ('[#7][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
715
+ 689: ('Cl[#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
716
+ 690: ('Cl[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
717
+ 691: ('Br[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
718
+ 692: ('[#6][#6]1[#6][#6]([#6])[#6][#6]1', 0),
719
+ 693: ('[#6][#6]1[#6][#6]([#8])[#6][#6]1', 0),
720
+ 694: ('[#6][#6]1[#6][#6]([#16])[#6][#6]1', 0),
721
+ 695: ('[#6][#6]1[#6][#6]([#7])[#6][#6]1', 0),
722
+ 696: ('[#6][#6]1[#6][#6](Cl)[#6][#6]1', 0),
723
+ 697: ('[#6][#6]1[#6][#6](Br)[#6][#6]1', 0),
724
+ 698: ('[#8][#6]1[#6][#6]([#8])[#6][#6]1', 0),
725
+ 699: ('[#8][#6]1[#6][#6]([#16])[#6][#6]1', 0),
726
+ 700: ('[#8][#6]1[#6][#6]([#7])[#6][#6]1', 0),
727
+ 701: ('[#8][#6]1[#6][#6](Cl)[#6][#6]1', 0),
728
+ 702: ('[#8][#6]1[#6][#6](Br)[#6][#6]1', 0),
729
+ 703: ('[#16][#6]1[#6][#6]([#16])[#6][#6]1', 0),
730
+ 704: ('[#16][#6]1[#6][#6]([#7])[#6][#6]1', 0),
731
+ 705: ('[#16][#6]1[#6][#6](Cl)[#6][#6]1', 0),
732
+ 706: ('[#16][#6]1[#6][#6](Br)[#6][#6]1', 0),
733
+ 707: ('[#7][#6]1[#6][#6]([#7])[#6][#6]1', 0),
734
+ 708: ('[#7][#6]1[#6][#6](Cl)[#6][#6]1', 0),
735
+ 709: ('[#7][#6]1[#6][#6](Br)[#6][#6]1', 0),
736
+ 710: ('Cl[#6]1[#6][#6](Cl)[#6][#6]1', 0),
737
+ 711: ('Cl[#6]1[#6][#6](Br)[#6][#6]1', 0),
738
+ 712: ('Br[#6]1[#6][#6](Br)[#6][#6]1', 0),
739
+ 713: ('[#6][#6]1[#6]([#6])[#6][#6][#6]1', 0),
740
+ 714: ('[#6][#6]1[#6]([#8])[#6][#6][#6]1', 0),
741
+ 715: ('[#6][#6]1[#6]([#16])[#6][#6][#6]1', 0),
742
+ 716: ('[#6][#6]1[#6]([#7])[#6][#6][#6]1', 0),
743
+ 717: ('[#6][#6]1[#6](Cl)[#6][#6][#6]1', 0),
744
+ 718: ('[#6][#6]1[#6](Br)[#6][#6][#6]1', 0),
745
+ 719: ('[#8][#6]1[#6]([#8])[#6][#6][#6]1', 0),
746
+ 720: ('[#8][#6]1[#6]([#16])[#6][#6][#6]1', 0),
747
+ 721: ('[#8][#6]1[#6]([#7])[#6][#6][#6]1', 0),
748
+ 722: ('[#8][#6]1[#6](Cl)[#6][#6][#6]1', 0),
749
+ 723: ('[#8][#6]1[#6](Br)[#6][#6][#6]1', 0),
750
+ 724: ('[#16][#6]1[#6]([#16])[#6][#6][#6]1', 0),
751
+ 725: ('[#16][#6]1[#6]([#7])[#6][#6][#6]1', 0),
752
+ 726: ('[#16][#6]1[#6](Cl)[#6][#6][#6]1', 0),
753
+ 727: ('[#16][#6]1[#6](Br)[#6][#6][#6]1', 0),
754
+ 728: ('[#7][#6]1[#6]([#7])[#6][#6][#6]1', 0),
755
+ 729: ('[#7][#6]1[#6](Cl)[#6][#6]1', 0),
756
+ 730: ('[#7][#6]1[#6](Br)[#6][#6][#6]1', 0),
757
+ 731: ('Cl[#6]1[#6](Cl)[#6][#6][#6]1', 0),
758
+ 732: ('Cl[#6]1[#6](Br)[#6][#6][#6]1', 0),
759
+ 733: ('Br[#6]1[#6](Br)[#6][#6][#6]1', 0)}
760
+
761
+ PubchemKeys = None
762
+
763
+
764
def InitKeys(keyList, keyDict):
    """ *Internal Use Only*

    Compile the SMARTS pattern of every key in ``keyDict`` and store the
    resulting (query-mol, count) pair into ``keyList`` at position key-1.
    Intended to run once per process to populate the module-level cache.

    Keys whose pattern is the placeholder ``'?'`` are left untouched; a
    pattern RDKit cannot parse is reported and its slot is also left as-is.
    """
    assert len(keyList) == len(keyDict), 'length mismatch'
    for key, (patt, count) in keyDict.items():
        if patt == '?':
            # Placeholder entry - nothing to compile for this bit.
            continue
        sma = Chem.MolFromSmarts(patt)
        if not sma:
            print('SMARTS parser error for key #%d: %s' % (key, patt))
        else:
            # Keys are 1-indexed; the list is 0-indexed.
            keyList[key - 1] = sma, count
777
+
778
+
779
def calcPubChemFingerPart1(mol, **kwargs):
    """ Calculate the SMARTS-based part of the PubChem fingerprint
    (bits 1-115 and 263-881).

    **Arguments**
      - mol: the molecule to be fingerprinted
      - ctor (keyword, optional): bit-vector constructor; defaults to
        DataStructs.SparseBitVect. Other keyword arguments are ignored.

    **Returns**
      a bit vector (by default a _DataStructs.SparseBitVect_) containing
      the fingerprint; bit 0 is unused because keys are 1-indexed.

    >>> m = Chem.MolFromSmiles('CNO')
    >>> bv = calcPubChemFingerPart1(m)
    >>> tuple(bv.GetOnBits())
    (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
    >>> bv = calcPubChemFingerPart1(Chem.MolFromSmiles('CCC'))
    >>> tuple(bv.GetOnBits())
    (74, 114, 149, 155, 160)
    """
    global PubchemKeys
    if PubchemKeys is None:
        # Lazily compile all SMARTS patterns exactly once per process.
        PubchemKeys = [(None, 0)] * len(smartsPatts)
        InitKeys(PubchemKeys, smartsPatts)
    ctor = kwargs.get('ctor', DataStructs.SparseBitVect)
    res = ctor(len(PubchemKeys) + 1)
    for idx, (patt, count) in enumerate(PubchemKeys, start=1):
        if patt is None:
            # Unparseable or placeholder pattern - bit stays off.
            continue
        if count == 0:
            # Presence bit: set when the pattern matches at all.
            res[idx] = mol.HasSubstructMatch(patt)
        elif len(mol.GetSubstructMatches(patt)) > count:
            # Count bit: set when matches exceed the threshold.
            res[idx] = 1
    return res
809
+
810
+
811
def func_1(mol, bits):
    """ *Internal Use Only*

    Calculate the ring-count section of the PubChem fingerprint
    (bits 116-263 overall): counts of rings by size 3-10.

    Returns a (ringSize, bits) pair where ringSize lists the size of every
    ring found and bits has the corresponding count bits switched on.

    For each ring size the fingerprint reserves a fixed number of slots
    spaced 7 bits apart; a count of c switches on the first min(c, slots)
    of them, exactly reproducing the original >=N / ==N ladder.
    """
    ring_sizes = []
    counts = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    for ring in mol.GetRingInfo().AtomRings():
        size = len(ring)
        ring_sizes.append(size)
        if size in counts:
            counts[size] += 1
    # size -> (first bit offset, number of count slots); slots are 7 apart.
    layout = {3: (0, 2), 4: (14, 2), 5: (28, 5), 6: (63, 5),
              7: (98, 2), 8: (112, 2), 9: (126, 1), 10: (133, 1)}
    for size, (first, slots) in layout.items():
        for j in range(min(counts[size], slots)):
            bits[first + 7 * j] = 1
    return ring_sizes, bits
905
+
906
+
907
def func_2(mol, bits):
    """ *Internal Use Only*

    Ring-count bits for saturated or aromatic carbon-only rings.

    A ring counts when either every bond in it is a SINGLE bond
    (saturated), or every bond is AROMATIC with both end atoms being
    carbon (aromatic carbon-only). Returns (ringSize, bits).
    """
    counts = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    ring_sizes = []

    def _tally(ring):
        # Record one qualifying ring of this size.
        ring_sizes.append(len(ring))
        if len(ring) in counts:
            counts[len(ring)] += 1

    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        # Saturated: no bond other than SINGLE anywhere in the ring.
        if all(b.GetBondType().name == 'SINGLE' for b in bonds):
            _tally(ring)
        # Aromatic carbon-only: all bonds aromatic, all ring atoms carbon.
        aromatic = all(b.GetBondType().name == 'AROMATIC' for b in bonds)
        all_carbon = all(
            b.GetBeginAtom().GetAtomicNum() == 6 and b.GetEndAtom().GetAtomicNum() == 6
            for b in bonds)
        if aromatic and all_carbon:
            _tally(ring)

    # size -> (first bit offset, number of count slots); slots are 7 apart.
    layout = {3: (1, 2), 4: (15, 2), 5: (29, 5), 6: (64, 5),
              7: (99, 2), 8: (113, 2), 9: (127, 1), 10: (134, 1)}
    for size, (first, slots) in layout.items():
        for j in range(min(counts[size], slots)):
            bits[first + 7 * j] = 1
    return ring_sizes, bits
1025
+
1026
+
1027
def func_3(mol, bits):
    """ *Internal Use Only*

    Ring-count bits for saturated or aromatic nitrogen-containing rings.

    A ring counts when either every bond in it is a SINGLE bond
    (saturated), or every bond is AROMATIC and at least one ring atom is
    nitrogen. Returns (ringSize, bits).
    """
    counts = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    ring_sizes = []

    def _tally(ring):
        # Record one qualifying ring of this size.
        ring_sizes.append(len(ring))
        if len(ring) in counts:
            counts[len(ring)] += 1

    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        # Saturated: no bond other than SINGLE anywhere in the ring.
        if all(b.GetBondType().name == 'SINGLE' for b in bonds):
            _tally(ring)
        # Aromatic nitrogen-containing: all bonds aromatic and some atom is N.
        aromatic = all(b.GetBondType().name == 'AROMATIC' for b in bonds)
        has_nitrogen = any(
            b.GetBeginAtom().GetAtomicNum() == 7 or b.GetEndAtom().GetAtomicNum() == 7
            for b in bonds)
        if aromatic and has_nitrogen:
            _tally(ring)

    # size -> (first bit offset, number of count slots); slots are 7 apart.
    layout = {3: (2, 2), 4: (16, 2), 5: (30, 5), 6: (65, 5),
              7: (100, 2), 8: (114, 2), 9: (128, 1), 10: (135, 1)}
    for size, (first, slots) in layout.items():
        for j in range(min(counts[size], slots)):
            bits[first + 7 * j] = 1
    return ring_sizes, bits
1145
+
1146
+
1147
def func_4(mol, bits):
    """ *Internal Use Only*
    saturated or aromatic heteroatom-containing
    """
    ring_sizes = []
    counts = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    for ring in mol.GetRingInfo().BondRings():
        bond_types = [mol.GetBondWithIdx(i).GetBondType().name for i in ring]
        # Saturated ring: every ring bond is a single bond.
        if all(t == 'SINGLE' for t in bond_types):
            ring_sizes.append(len(ring))
            if len(ring) in counts:
                counts[len(ring)] += 1
        # Aromatic heteroatom-containing ring: all bonds aromatic and at
        # least one bonded atom is neither hydrogen nor carbon.
        is_aromatic = all(t == 'AROMATIC' for t in bond_types)
        has_hetero = any(
            mol.GetBondWithIdx(i).GetBeginAtom().GetAtomicNum() not in (1, 6)
            or mol.GetBondWithIdx(i).GetEndAtom().GetAtomicNum() not in (1, 6)
            for i in ring
        )
        if is_aromatic and has_hetero:
            ring_sizes.append(len(ring))
            if len(ring) in counts:
                counts[len(ring)] += 1
    # Each ring size owns a run of bit positions spaced 7 apart; the n-th
    # occurrence of a ring of that size sets the n-th bit of the run,
    # capped at the run length.
    first_bit = {3: 3, 4: 17, 5: 31, 6: 66, 7: 101, 8: 115, 9: 129, 10: 136}
    run_length = {3: 2, 4: 2, 5: 5, 6: 5, 7: 2, 8: 2, 9: 1, 10: 1}
    for size, start in first_bit.items():
        for n in range(min(counts[size], run_length[size])):
            bits[start + 7 * n] = 1
    return ring_sizes, bits
1265
+
1266
+
1267
def func_5(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic carbon-only
    """
    ring_sizes = []
    counts = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    for ring in mol.GetRingInfo().BondRings():
        bond_types = [mol.GetBondWithIdx(i).GetBondType().name for i in ring]
        # Unsaturated: at least one non-single bond (aromatic also counts).
        is_unsaturated = any(t != 'SINGLE' for t in bond_types)
        # Non-aromatic: no bond in the ring is aromatic.
        is_nonaromatic = all(t != 'AROMATIC' for t in bond_types)
        # Carbon-only: both atoms of every ring bond are carbon.
        all_carbon = all(
            mol.GetBondWithIdx(i).GetBeginAtom().GetAtomicNum() == 6
            and mol.GetBondWithIdx(i).GetEndAtom().GetAtomicNum() == 6
            for i in ring
        )
        if is_unsaturated and is_nonaromatic and all_carbon:
            ring_sizes.append(len(ring))
            if len(ring) in counts:
                counts[len(ring)] += 1
    # Bit runs spaced 7 apart; the n-th ring of a given size sets the
    # n-th bit of that size's run, capped at the run length.
    first_bit = {3: 4, 4: 18, 5: 32, 6: 67, 7: 102, 8: 116, 9: 130, 10: 137}
    run_length = {3: 2, 4: 2, 5: 5, 6: 5, 7: 2, 8: 2, 9: 1, 10: 1}
    for size, start in first_bit.items():
        for n in range(min(counts[size], run_length[size])):
            bits[start + 7 * n] = 1
    return ring_sizes, bits
1381
+
1382
+
1383
def func_6(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic nitrogen-containing
    """
    ring_sizes = []
    counts = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    for ring in mol.GetRingInfo().BondRings():
        bond_types = [mol.GetBondWithIdx(i).GetBondType().name for i in ring]
        # Unsaturated: at least one non-single bond.
        is_unsaturated = any(t != 'SINGLE' for t in bond_types)
        # Non-aromatic: no aromatic bond in the ring.
        is_nonaromatic = all(t != 'AROMATIC' for t in bond_types)
        # Nitrogen-containing: some ring bond touches a nitrogen atom.
        has_nitrogen = any(
            mol.GetBondWithIdx(i).GetBeginAtom().GetAtomicNum() == 7
            or mol.GetBondWithIdx(i).GetEndAtom().GetAtomicNum() == 7
            for i in ring
        )
        if is_unsaturated and is_nonaromatic and has_nitrogen:
            ring_sizes.append(len(ring))
            if len(ring) in counts:
                counts[len(ring)] += 1
    # Bit runs spaced 7 apart; the n-th ring of a given size sets the
    # n-th bit of that size's run, capped at the run length.
    first_bit = {3: 5, 4: 19, 5: 33, 6: 68, 7: 103, 8: 117, 9: 131, 10: 138}
    run_length = {3: 2, 4: 2, 5: 5, 6: 5, 7: 2, 8: 2, 9: 1, 10: 1}
    for size, start in first_bit.items():
        for n in range(min(counts[size], run_length[size])):
            bits[start + 7 * n] = 1
    return ring_sizes, bits
1497
+
1498
+
1499
def func_7(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic heteroatom-containing
    """
    ring_sizes = []
    counts = {3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    for ring in mol.GetRingInfo().BondRings():
        bond_types = [mol.GetBondWithIdx(i).GetBondType().name for i in ring]
        # Unsaturated: at least one non-single bond.
        is_unsaturated = any(t != 'SINGLE' for t in bond_types)
        # Non-aromatic: no aromatic bond in the ring.
        is_nonaromatic = all(t != 'AROMATIC' for t in bond_types)
        # Heteroatom-containing: some ring bond touches an atom that is
        # neither hydrogen nor carbon.
        has_hetero = any(
            mol.GetBondWithIdx(i).GetBeginAtom().GetAtomicNum() not in (1, 6)
            or mol.GetBondWithIdx(i).GetEndAtom().GetAtomicNum() not in (1, 6)
            for i in ring
        )
        if is_unsaturated and is_nonaromatic and has_hetero:
            ring_sizes.append(len(ring))
            if len(ring) in counts:
                counts[len(ring)] += 1
    # Bit runs spaced 7 apart; the n-th ring of a given size sets the
    # n-th bit of that size's run, capped at the run length.
    first_bit = {3: 6, 4: 20, 5: 34, 6: 69, 7: 104, 8: 118, 9: 132, 10: 139}
    run_length = {3: 2, 4: 2, 5: 5, 6: 5, 7: 2, 8: 2, 9: 1, 10: 1}
    for size, start in first_bit.items():
        for n in range(min(counts[size], run_length[size])):
            bits[start + 7 * n] = 1
    return ring_sizes, bits
1613
+
1614
+
1615
def func_8(mol, bits):
    """ *Internal Use Only*
    aromatic rings or hetero-aromatic rings
    """
    n_aromatic = 0
    n_hetero = 0
    for ring in mol.GetRingInfo().BondRings():
        # Fully aromatic ring.
        if all(mol.GetBondWithIdx(i).GetBondType().name == 'AROMATIC'
               for i in ring):
            n_aromatic += 1
        # Heteroatom-containing ring: counted independently of
        # aromaticity (a saturated heterocycle also increments this).
        if any(
            mol.GetBondWithIdx(i).GetBeginAtom().GetAtomicNum() not in (1, 6)
            or mol.GetBondWithIdx(i).GetEndAtom().GetAtomicNum() not in (1, 6)
            for i in ring
        ):
            n_hetero += 1
    # Bits 140/142/144/146: at least 1..4 aromatic rings present.
    for n in range(min(n_aromatic, 4)):
        bits[140 + 2 * n] = 1
    # Bits 141/143/145/147: set only when the aromatic and heteroatom
    # ring counts are equal (1..3), or both reach at least 4.
    if n_aromatic >= 4 and n_hetero >= 4:
        matched = 4
    elif n_aromatic == n_hetero and 1 <= n_aromatic <= 3:
        matched = n_aromatic
    else:
        matched = 0
    for n in range(matched):
        bits[141 + 2 * n] = 1
    return bits
1671
+
1672
+
1673
def calcPubChemFingerPart2(mol):  # 116-263
    """ *Internal Use Only*
    Calculate PubChem Fingerprints (116-263)
    """
    bits = [0] * 148
    # func_1..func_7 each return (ringSize, bits); only the updated bit
    # list is carried forward.  func_8 returns the bit list directly.
    for scanner in (func_1, func_2, func_3, func_4, func_5, func_6, func_7):
        bits = scanner(mol, bits)[1]
    return func_8(mol, bits)
1688
+
1689
+
1690
def GetPubChemFPs(mol):
    """*Internal Use Only*
    Calculate PubChem Fingerprints
    """
    mol = Chem.AddHs(mol)
    all_bits = [0] * 881
    part1 = list(calcPubChemFingerPart1(mol).ToBitString())
    # Output bits 0-114 come from part-1 positions 1-115.
    for offset, flag in enumerate(part1[1:116]):
        if flag == '1':
            all_bits[offset] = 1
    # Output bits 263-880 come from part-1 positions 116-733
    # (offset 115 + 148 = 263, skipping the part-2 region).
    for offset, flag in enumerate(part1[116:734]):
        if flag == '1':
            all_bits[offset + 263] = 1
    # Output bits 115-262 come from the ring-based part-2 calculation.
    for offset, flag in enumerate(calcPubChemFingerPart2(mol)):
        if flag == 1:
            all_bits[offset + 115] = 1
    return np.array(all_bits, dtype=np.bool_)
1710
+
1711
+
1712
+ # ------------------------------------
1713
+
1714
+
1715
# Directory containing this module; used to locate the bundled
# 'pubchemfp.xlsx' metadata spreadsheet.
file_path = os.path.dirname(__file__)
1716
+
1717
+
1718
def GetPubChemFPInfos():
    """Load the PubChem fingerprint description table shipped with this module.

    Returns the contents of 'pubchemfp.xlsx' as a pandas DataFrame.
    """
    info_path = os.path.join(file_path, 'pubchemfp.xlsx')
    return pd.read_excel(info_path)
1720
+
1721
+
1722
if __name__ == '__main__':
    # Smoke test: fingerprint a single hard-coded molecule and print it.
    print('-' * 10 + 'START' + '-' * 10)
    smiles = 'C1=NC2NC3=CNCC3=CC2CC1'
    molecule = Chem.MolFromSmiles(smiles)
    # NOTE(review): GetPubChemFPs calls Chem.AddHs itself, so hydrogens
    # are added twice here -- redundant but harmless.
    molecule_with_h = Chem.AddHs(molecule)
    fingerprint = GetPubChemFPs(molecule_with_h)
    print('Molecule: %s' % smiles)
    print('-' * 25)
    print('Results: %s' % fingerprint)
    print('-' * 10 + 'END' + '-' * 10)
deepscreen/data/featurizers/fingerprint/pubchemfp.xlsx ADDED
Binary file (41.2 kB). View file
 
deepscreen/data/featurizers/fingerprint/rdkitfp.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ topological fingerprint
3
+
4
+ """
5
+
6
+ import numpy as np
7
+ from rdkit.Chem.rdmolops import RDKFingerprint
8
+ from rdkit.Chem import DataStructs
9
+
10
+ _type = 'topological-based'
11
+
12
+
13
def GetRDkitFPs(mol, nBits=2048, return_bitInfo=False):
    """Calculate the RDKit (Daylight-like) topological fingerprint.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        The molecule to fingerprint.
    nBits : int, optional
        Fingerprint length (default 2048).
    return_bitInfo : bool, optional
        If True, also return the bit-info dict produced by
        ``RDKFingerprint`` (mapping set bit positions to the bond paths
        that set them).

    Returns
    -------
    numpy.ndarray of bool
        The fingerprint as a boolean vector of length ``nBits``; when
        ``return_bitInfo`` is True, a ``(fingerprint, bitInfo)`` tuple.
    """
    bitInfo = {}
    fp = RDKFingerprint(mol, fpSize=nBits, bitInfo=bitInfo)
    arr = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(fp, arr)
    if return_bitInfo:
        # BUG FIX: previously returned the boolean flag `return_bitInfo`
        # instead of the collected `bitInfo` dict.
        return arr, bitInfo
    return arr
deepscreen/data/featurizers/fingerprint/smarts_maccskey.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# MACCS structural-key SMARTS definitions, keyed 'MACCSFP0'..'MACCSFP166'.
# Each value is a (SMARTS, count) pair.  NOTE(review): the count threshold
# semantics are defined by the consumer of this table (presumably "more
# than `count` matches", RDKit's MACCS convention) -- verify against the
# fingerprint builder.  Entries whose pattern is '?' or None have no
# SMARTS definition and act as placeholders.
smartsPatts = {
    'MACCSFP0': (None, 0),
    # ignore, Bit 0 is a placeholder and should be ignored: https://github.com/rdkit/rdkit/issues/1726
    'MACCSFP1': ('?', 0),
    'MACCSFP2': ('[#104]', 0),
    'MACCSFP3': ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),
    'MACCSFP4': ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
    'MACCSFP5': ('[Sc,Ti,Y,Zr,Hf]', 0),
    'MACCSFP6': ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
    'MACCSFP7': ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
    'MACCSFP8': ('[!#6;!#1]1~*~*~*~1', 0),
    'MACCSFP9': ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
    'MACCSFP10': ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
    'MACCSFP11': ('*1~*~*~*~1', 0),
    'MACCSFP12': ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
    'MACCSFP13': ('[#8]~[#7](~[#6])~[#6]', 0),
    'MACCSFP14': ('[#16]-[#16]', 0),
    'MACCSFP15': ('[#8]~[#6](~[#8])~[#8]', 0),
    'MACCSFP16': ('[!#6;!#1]1~*~*~1', 0),
    'MACCSFP17': ('[#6]#[#6]', 0),
    'MACCSFP18': ('[#5,#13,#31,#49,#81]', 0),
    'MACCSFP19': ('*1~*~*~*~*~*~*~1', 0),
    'MACCSFP20': ('[#14]', 0),
    'MACCSFP21': ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),
    'MACCSFP22': ('*1~*~*~1', 0),
    'MACCSFP23': ('[#7]~[#6](~[#8])~[#8]', 0),
    'MACCSFP24': ('[#7]-[#8]', 0),
    'MACCSFP25': ('[#7]~[#6](~[#7])~[#7]', 0),
    'MACCSFP26': ('[#6]=;@[#6](@*)@*', 0),
    'MACCSFP27': ('[I]', 0),
    'MACCSFP28': ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),
    'MACCSFP29': ('[#15]', 0),
    'MACCSFP30': ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),
    'MACCSFP31': ('[!#6;!#1]~[F,Cl,Br,I]', 0),
    'MACCSFP32': ('[#6]~[#16]~[#7]', 0),
    'MACCSFP33': ('[#7]~[#16]', 0),
    'MACCSFP34': ('[CH2]=*', 0),
    'MACCSFP35': ('[Li,Na,K,Rb,Cs,Fr]', 0),
    'MACCSFP36': ('[#16R]', 0),
    'MACCSFP37': ('[#7]~[#6](~[#8])~[#7]', 0),
    'MACCSFP38': ('[#7]~[#6](~[#6])~[#7]', 0),
    'MACCSFP39': ('[#8]~[#16](~[#8])~[#8]', 0),
    'MACCSFP40': ('[#16]-[#8]', 0),
    'MACCSFP41': ('[#6]#[#7]', 0),
    'MACCSFP42': ('F', 0),
    'MACCSFP43': ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),
    'MACCSFP44': ('?', 0),
    'MACCSFP45': ('[#6]=[#6]~[#7]', 0),
    'MACCSFP46': ('Br', 0),
    'MACCSFP47': ('[#16]~*~[#7]', 0),
    'MACCSFP48': ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),
    'MACCSFP49': ('[!+0]', 0),
    'MACCSFP50': ('[#6]=[#6](~[#6])~[#6]', 0),
    'MACCSFP51': ('[#6]~[#16]~[#8]', 0),
    'MACCSFP52': ('[#7]~[#7]', 0),
    'MACCSFP53': ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),
    'MACCSFP54': ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),
    'MACCSFP55': ('[#8]~[#16]~[#8]', 0),
    'MACCSFP56': ('[#8]~[#7](~[#8])~[#6]', 0),
    'MACCSFP57': ('[#8R]', 0),
    'MACCSFP58': ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),
    'MACCSFP59': ('[#16]!:*:*', 0),
    'MACCSFP60': ('[#16]=[#8]', 0),
    'MACCSFP61': ('*~[#16](~*)~*', 0),
    'MACCSFP62': ('*@*!@*@*', 0),
    'MACCSFP63': ('[#7]=[#8]', 0),
    'MACCSFP64': ('*@*!@[#16]', 0),
    'MACCSFP65': ('c:n', 0),
    'MACCSFP66': ('[#6]~[#6](~[#6])(~[#6])~*', 0),
    'MACCSFP67': ('[!#6;!#1]~[#16]', 0),
    'MACCSFP68': ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),
    'MACCSFP69': ('[!#6;!#1]~[!#6;!#1;!H0]', 0),
    'MACCSFP70': ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),
    'MACCSFP71': ('[#7]~[#8]', 0),
    'MACCSFP72': ('[#8]~*~*~[#8]', 0),
    'MACCSFP73': ('[#16]=*', 0),
    'MACCSFP74': ('[CH3]~*~[CH3]', 0),
    'MACCSFP75': ('*!@[#7]@*', 0),
    'MACCSFP76': ('[#6]=[#6](~*)~*', 0),
    'MACCSFP77': ('[#7]~*~[#7]', 0),
    'MACCSFP78': ('[#6]=[#7]', 0),
    'MACCSFP79': ('[#7]~*~*~[#7]', 0),
    'MACCSFP80': ('[#7]~*~*~*~[#7]', 0),
    'MACCSFP81': ('[#16]~*(~*)~*', 0),
    'MACCSFP82': ('*~[CH2]~[!#6;!#1;!H0]', 0),
    'MACCSFP83': ('[!#6;!#1]1~*~*~*~*~1', 0),
    'MACCSFP84': ('[NH2]', 0),
    'MACCSFP85': ('[#6]~[#7](~[#6])~[#6]', 0),
    'MACCSFP86': ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),
    'MACCSFP87': ('[F,Cl,Br,I]!@*@*', 0),
    'MACCSFP88': ('[#16]', 0),
    'MACCSFP89': ('[#8]~*~*~*~[#8]', 0),
    'MACCSFP90': (
        '[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',
        0),
    'MACCSFP91': (
        '[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',
        0),
    'MACCSFP92': ('[#8]~[#6](~[#7])~[#6]', 0),
    'MACCSFP93': ('[!#6;!#1]~[CH3]', 0),
    'MACCSFP94': ('[!#6;!#1]~[#7]', 0),
    'MACCSFP95': ('[#7]~*~*~[#8]', 0),
    'MACCSFP96': ('*1~*~*~*~*~1', 0),
    'MACCSFP97': ('[#7]~*~*~*~[#8]', 0),
    'MACCSFP98': ('[!#6;!#1]1~*~*~*~*~*~1', 0),
    'MACCSFP99': ('[#6]=[#6]', 0),
    'MACCSFP100': ('*~[CH2]~[#7]', 0),
    'MACCSFP101': (
        '[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',
        0),
    'MACCSFP102': ('[!#6;!#1]~[#8]', 0),
    'MACCSFP103': ('Cl', 0),
    'MACCSFP104': ('[!#6;!#1;!H0]~*~[CH2]~*', 0),
    'MACCSFP105': ('*@*(@*)@*', 0),
    'MACCSFP106': ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),
    'MACCSFP107': ('[F,Cl,Br,I]~*(~*)~*', 0),
    'MACCSFP108': ('[CH3]~*~*~*~[CH2]~*', 0),
    'MACCSFP109': ('*~[CH2]~[#8]', 0),
    'MACCSFP110': ('[#7]~[#6]~[#8]', 0),
    'MACCSFP111': ('[#7]~*~[CH2]~*', 0),
    'MACCSFP112': ('*~*(~*)(~*)~*', 0),
    'MACCSFP113': ('[#8]!:*:*', 0),
    'MACCSFP114': ('[CH3]~[CH2]~*', 0),
    'MACCSFP115': ('[CH3]~*~[CH2]~*', 0),
    'MACCSFP116': ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),
    'MACCSFP117': ('[#7]~*~[#8]', 0),
    'MACCSFP118': ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),
    'MACCSFP119': ('[#7]=*', 0),
    'MACCSFP120': ('[!#6;R]', 1),
    'MACCSFP121': ('[#7;R]', 0),
    'MACCSFP122': ('*~[#7](~*)~*', 0),
    'MACCSFP123': ('[#8]~[#6]~[#8]', 0),
    'MACCSFP124': ('[!#6;!#1]~[!#6;!#1]', 0),
    'MACCSFP125': ('?', 0),
    'MACCSFP126': ('*!@[#8]!@*', 0),
    'MACCSFP127': ('*@*!@[#8]', 1),
    'MACCSFP128': (
        '[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',
        0),
    'MACCSFP129': ('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',
                   0),
    'MACCSFP130': ('[!#6;!#1]~[!#6;!#1]', 1),
    'MACCSFP131': ('[!#6;!#1;!H0]', 1),
    'MACCSFP132': ('[#8]~*~[CH2]~*', 0),
    'MACCSFP133': ('*@*!@[#7]', 0),
    'MACCSFP134': ('[F,Cl,Br,I]', 0),
    'MACCSFP135': ('[#7]!:*:*', 0),
    'MACCSFP136': ('[#8]=*', 1),
    'MACCSFP137': ('[!C;!c;R]', 0),
    'MACCSFP138': ('[!#6;!#1]~[CH2]~*', 1),
    'MACCSFP139': ('[O;!H0]', 0),
    'MACCSFP140': ('[#8]', 3),
    'MACCSFP141': ('[CH3]', 2),
    'MACCSFP142': ('[#7]', 1),
    'MACCSFP143': ('*@*!@[#8]', 0),
    'MACCSFP144': ('*!:*:*!:*', 0),
    'MACCSFP145': ('*1~*~*~*~*~*~1', 1),
    'MACCSFP146': ('[#8]', 2),
    'MACCSFP147': ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),
    'MACCSFP148': ('*~[!#6;!#1](~*)~*', 0),
    'MACCSFP149': ('[C;H3,H4]', 1),
    'MACCSFP150': ('*!@*@*!@*', 0),
    'MACCSFP151': ('[#7;!H0]', 0),
    'MACCSFP152': ('[#8]~[#6](~[#6])~[#6]', 0),
    'MACCSFP153': ('[!#6;!#1]~[CH2]~*', 0),
    'MACCSFP154': ('[#6]=[#8]', 0),
    'MACCSFP155': ('*!@[CH2]!@*', 0),
    'MACCSFP156': ('[#7]~*(~*)~*', 0),
    'MACCSFP157': ('[#6]-[#8]', 0),
    'MACCSFP158': ('[#6]-[#7]', 0),
    'MACCSFP159': ('[#8]', 1),
    'MACCSFP160': ('[C;H3,H4]', 0),
    'MACCSFP161': ('[#7]', 0),
    'MACCSFP162': ('a', 0),
    'MACCSFP163': ('*1~*~*~*~*~*~1', 0),
    'MACCSFP164': ('[#8]', 0),
    'MACCSFP165': ('[R]', 0),
    'MACCSFP166': ('?', 0)}
deepscreen/data/featurizers/fingerprint/smarts_pharmacophore.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# SMARTS patterns for six pharmacophore feature families.  Each list holds
# alternative patterns; the families are collected in `pharmacophore_smarts`
# at the bottom of this module.

# Hydrogen-bond donors: N-H (trivalent, or protonated tetravalent),
# neutral O-H / S-H, and aromatic n-H.
Donor = ["[N;!H0;v3,v4&+1]", "[O,S;H1;+0]", "[n&H1&+0]"]

# Hydrogen-bond acceptors: O/S-H not next to an electron-withdrawing
# group, ether-like O, anionic O/S, basic trivalent N, and aromatic
# n / o with the listed exclusions.
Acceptor = ["[O,S;H1;v2;!$(*-*=[O,N,P,S])]", "[O;H0;v2]", "[O,S;v1;-]",
            "[N;v3;!$(N-*=[O,N,P,S])]", "[n&H0&+0]", "[o;+0;!$([o]:n);!$([o]:c:n)]"]

# Positively ionizable groups: charged N, plus primary/secondary/tertiary
# amines whose neighbors are not carbonyl carbons.
Positive = ["[#7;+]", "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
            "[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
            "[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]"]

# Negatively ionizable groups: acids of the form [C,S](=O/S/P)-OH or -O(-).
Negative = ["[C,S](=[O,S,P])-[O;H1,-1]"]

# Hydrophobic centres: branched carbons bearing two methyls, and
# dialkyl sulfides.
Hydrophobic = ["[C;D3,D4](-[CH3])-[CH3]", "[S;D2](-C)-C"]

# Aromatic atoms.
Aromatic = ["a"]

# Feature-name -> pattern-list lookup table.
pharmacophore_smarts = {"Donor": Donor,
                        "Acceptor": Acceptor,
                        "Positive": Positive,
                        "Negative": Negative,
                        "Hydrophobic": Hydrophobic,
                        "Aromatic": Aromatic}
deepscreen/data/featurizers/fingerprint/smarts_pubchem.py ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ smartsPatts = {
2
+ 'PubChemFP0': ('[H]', 3),
3
+ 'PubChemFP1': ('[H]', 7),
4
+ 'PubChemFP2': ('[H]', 15),
5
+ 'PubChemFP3': ('[H]', 31),
6
+ 'PubChemFP4': ('[Li]', 0),
7
+ 'PubChemFP5': ('[Li]', 1),
8
+ 'PubChemFP6': ('[B]', 0),
9
+ 'PubChemFP7': ('[B]', 1),
10
+ 'PubChemFP8': ('[B]', 3),
11
+ 'PubChemFP9': ('[C]', 1),
12
+ 'PubChemFP10': ('[C]', 3),
13
+ 'PubChemFP11': ('[C]', 7),
14
+ 'PubChemFP12': ('[C]', 15),
15
+ 'PubChemFP13': ('[C]', 31),
16
+ 'PubChemFP14': ('[N]', 0),
17
+ 'PubChemFP15': ('[N]', 1),
18
+ 'PubChemFP16': ('[N]', 3),
19
+ 'PubChemFP17': ('[N]', 7),
20
+ 'PubChemFP18': ('[O]', 0),
21
+ 'PubChemFP19': ('[O]', 1),
22
+ 'PubChemFP20': ('[O]', 3),
23
+ 'PubChemFP21': ('[O]', 7),
24
+ 'PubChemFP22': ('[O]', 15),
25
+ 'PubChemFP23': ('[F]', 0),
26
+ 'PubChemFP24': ('[F]', 1),
27
+ 'PubChemFP25': ('[F]', 3),
28
+ 'PubChemFP26': ('[Na]', 0),
29
+ 'PubChemFP27': ('[Na]', 1),
30
+ 'PubChemFP28': ('[Si]', 0),
31
+ 'PubChemFP29': ('[Si]', 1),
32
+ 'PubChemFP30': ('[P]', 0),
33
+ 'PubChemFP31': ('[P]', 1),
34
+ 'PubChemFP32': ('[P]', 3),
35
+ 'PubChemFP33': ('[S]', 0),
36
+ 'PubChemFP34': ('[S]', 1),
37
+ 'PubChemFP35': ('[S]', 3),
38
+ 'PubChemFP36': ('[S]', 7),
39
+ 'PubChemFP37': ('[Cl]', 0),
40
+ 'PubChemFP38': ('[Cl]', 1),
41
+ 'PubChemFP39': ('[Cl]', 3),
42
+ 'PubChemFP40': ('[Cl]', 7),
43
+ 'PubChemFP41': ('[K]', 0),
44
+ 'PubChemFP42': ('[K]', 1),
45
+ 'PubChemFP43': ('[Br]', 0),
46
+ 'PubChemFP44': ('[Br]', 1),
47
+ 'PubChemFP45': ('[Br]', 3),
48
+ 'PubChemFP46': ('[I]', 0),
49
+ 'PubChemFP47': ('[I]', 1),
50
+ 'PubChemFP48': ('[I]', 3),
51
+ 'PubChemFP49': ('[Be]', 0),
52
+ 'PubChemFP50': ('[Mg]', 0),
53
+ 'PubChemFP51': ('[Al]', 0),
54
+ 'PubChemFP52': ('[Ca]', 0),
55
+ 'PubChemFP53': ('[Sc]', 0),
56
+ 'PubChemFP54': ('[Ti]', 0),
57
+ 'PubChemFP55': ('[V]', 0),
58
+ 'PubChemFP56': ('[Cr]', 0),
59
+ 'PubChemFP57': ('[Mn]', 0),
60
+ 'PubChemFP58': ('[Fe]', 0),
61
+ 'PubChemFP59': ('[CO]', 0),
62
+ 'PubChemFP60': ('[Ni]', 0),
63
+ 'PubChemFP61': ('[Cu]', 0),
64
+ 'PubChemFP62': ('[Zn]', 0),
65
+ 'PubChemFP63': ('[Ga]', 0),
66
+ 'PubChemFP64': ('[Ge]', 0),
67
+ 'PubChemFP65': ('[As]', 0),
68
+ 'PubChemFP66': ('[Se]', 0),
69
+ 'PubChemFP67': ('[Kr]', 0),
70
+ 'PubChemFP68': ('[Rb]', 0),
71
+ 'PubChemFP69': ('[Sr]', 0),
72
+ 'PubChemFP70': ('[Y]', 0),
73
+ 'PubChemFP71': ('[Zr]', 0),
74
+ 'PubChemFP72': ('[Nb]', 0),
75
+ 'PubChemFP73': ('[Mo]', 0),
76
+ 'PubChemFP74': ('[Ru]', 0),
77
+ 'PubChemFP75': ('[Rh]', 0),
78
+ 'PubChemFP76': ('[Pd]', 0),
79
+ 'PubChemFP77': ('[Ag]', 0),
80
+ 'PubChemFP78': ('[Cd]', 0),
81
+ 'PubChemFP79': ('[In]', 0),
82
+ 'PubChemFP80': ('[Sn]', 0),
83
+ 'PubChemFP81': ('[Sb]', 0),
84
+ 'PubChemFP82': ('[Te]', 0),
85
+ 'PubChemFP83': ('[Xe]', 0),
86
+ 'PubChemFP84': ('[Cs]', 0),
87
+ 'PubChemFP85': ('[Ba]', 0),
88
+ 'PubChemFP86': ('[Lu]', 0),
89
+ 'PubChemFP87': ('[Hf]', 0),
90
+ 'PubChemFP88': ('[Ta]', 0),
91
+ 'PubChemFP89': ('[W]', 0),
92
+ 'PubChemFP90': ('[Re]', 0),
93
+ 'PubChemFP91': ('[Os]', 0),
94
+ 'PubChemFP92': ('[Ir]', 0),
95
+ 'PubChemFP93': ('[Pt]', 0),
96
+ 'PubChemFP94': ('[Au]', 0),
97
+ 'PubChemFP95': ('[Hg]', 0),
98
+ 'PubChemFP96': ('[Tl]', 0),
99
+ 'PubChemFP97': ('[Pb]', 0),
100
+ 'PubChemFP98': ('[Bi]', 0),
101
+ 'PubChemFP99': ('[La]', 0),
102
+ 'PubChemFP100': ('[Ce]', 0),
103
+ 'PubChemFP101': ('[Pr]', 0),
104
+ 'PubChemFP102': ('[Nd]', 0),
105
+ 'PubChemFP103': ('[Pm]', 0),
106
+ 'PubChemFP104': ('[Sm]', 0),
107
+ 'PubChemFP105': ('[Eu]', 0),
108
+ 'PubChemFP106': ('[Gd]', 0),
109
+ 'PubChemFP107': ('[Tb]', 0),
110
+ 'PubChemFP108': ('[Dy]', 0),
111
+ 'PubChemFP109': ('[Ho]', 0),
112
+ 'PubChemFP110': ('[Er]', 0),
113
+ 'PubChemFP111': ('[Tm]', 0),
114
+ 'PubChemFP112': ('[Yb]', 0),
115
+ 'PubChemFP113': ('[Tc]', 0),
116
+ 'PubChemFP114': ('[U]', 0),
117
+ 'PubChemFP263': ('[Li&!H0]', 0),
118
+ 'PubChemFP264': ('[Li]~[Li]', 0),
119
+ 'PubChemFP265': ('[Li]~[#5]', 0),
120
+ 'PubChemFP266': ('[Li]~[#6]', 0),
121
+ 'PubChemFP267': ('[Li]~[#8]', 0),
122
+ 'PubChemFP268': ('[Li]~[F]', 0),
123
+ 'PubChemFP269': ('[Li]~[#15]', 0),
124
+ 'PubChemFP270': ('[Li]~[#16]', 0),
125
+ 'PubChemFP271': ('[Li]~[Cl]', 0),
126
+ 'PubChemFP272': ('[#5&!H0]', 0),
127
+ 'PubChemFP273': ('[#5]~[#5]', 0),
128
+ 'PubChemFP274': ('[#5]~[#6]', 0),
129
+ 'PubChemFP275': ('[#5]~[#7]', 0),
130
+ 'PubChemFP276': ('[#5]~[#8]', 0),
131
+ 'PubChemFP277': ('[#5]~[F]', 0),
132
+ 'PubChemFP278': ('[#5]~[#14]', 0),
133
+ 'PubChemFP279': ('[#5]~[#15]', 0),
134
+ 'PubChemFP280': ('[#5]~[#16]', 0),
135
+ 'PubChemFP281': ('[#5]~[Cl]', 0),
136
+ 'PubChemFP282': ('[#5]~[Br]', 0),
137
+ 'PubChemFP283': ('[#6&!H0]', 0),
138
+ 'PubChemFP284': ('[#6]~[#6]', 0),
139
+ 'PubChemFP285': ('[#6]~[#7]', 0),
140
+ 'PubChemFP286': ('[#6]~[#8]', 0),
141
+ 'PubChemFP287': ('[#6]~[F]', 0),
142
+ 'PubChemFP288': ('[#6]~[Na]', 0),
143
+ 'PubChemFP289': ('[#6]~[Mg]', 0),
144
+ 'PubChemFP290': ('[#6]~[Al]', 0),
145
+ 'PubChemFP291': ('[#6]~[#14]', 0),
146
+ 'PubChemFP292': ('[#6]~[#15]', 0),
147
+ 'PubChemFP293': ('[#6]~[#16]', 0),
148
+ 'PubChemFP294': ('[#6]~[Cl]', 0),
149
+ 'PubChemFP295': ('[#6]~[#33]', 0),
150
+ 'PubChemFP296': ('[#6]~[#34]', 0),
151
+ 'PubChemFP297': ('[#6]~[Br]', 0),
152
+ 'PubChemFP298': ('[#6]~[I]', 0),
153
+ 'PubChemFP299': ('[#7&!H0]', 0),
154
+ 'PubChemFP300': ('[#7]~[#7]', 0),
155
+ 'PubChemFP301': ('[#7]~[#8]', 0),
156
+ 'PubChemFP302': ('[#7]~[F]', 0),
157
+ 'PubChemFP303': ('[#7]~[#14]', 0),
158
+ 'PubChemFP304': ('[#7]~[#15]', 0),
159
+ 'PubChemFP305': ('[#7]~[#16]', 0),
160
+ 'PubChemFP306': ('[#7]~[Cl]', 0),
161
+ 'PubChemFP307': ('[#7]~[Br]', 0),
162
+ 'PubChemFP308': ('[#8&!H0]', 0),
163
+ 'PubChemFP309': ('[#8]~[#8]', 0),
164
+ 'PubChemFP310': ('[#8]~[Mg]', 0),
165
+ 'PubChemFP311': ('[#8]~[Na]', 0),
166
+ 'PubChemFP312': ('[#8]~[Al]', 0),
167
+ 'PubChemFP313': ('[#8]~[#14]', 0),
168
+ 'PubChemFP314': ('[#8]~[#15]', 0),
169
+ 'PubChemFP315': ('[#8]~[K]', 0),
170
+ 'PubChemFP316': ('[F]~[#15]', 0),
171
+ 'PubChemFP317': ('[F]~[#16]', 0),
172
+ 'PubChemFP318': ('[Al&!H0]', 0),
173
+ 'PubChemFP319': ('[Al]~[Cl]', 0),
174
+ 'PubChemFP320': ('[#14&!H0]', 0),
175
+ 'PubChemFP321': ('[#14]~[#14]', 0),
176
+ 'PubChemFP322': ('[#14]~[Cl]', 0),
177
+ 'PubChemFP323': ('[#15&!H0]', 0),
178
+ 'PubChemFP324': ('[#15]~[#15]', 0),
179
+ 'PubChemFP325': ('[#33&!H0]', 0),
180
+ 'PubChemFP326': ('[#33]~[#33]', 0),
181
+ 'PubChemFP327': ('[#6](~Br)(~[#6])', 0),
182
+ 'PubChemFP328': ('[#6](~Br)(~[#6])(~[#6])', 0),
183
+ 'PubChemFP329': ('[#6&!H0]~[Br]', 0),
184
+ 'PubChemFP330': ('[#6](~[Br])(:[c])', 0),
185
+ 'PubChemFP331': ('[#6](~[Br])(:[n])', 0),
186
+ 'PubChemFP332': ('[#6](~[#6])(~[#6])', 0),
187
+ 'PubChemFP333': ('[#6](~[#6])(~[#6])(~[#6])', 0),
188
+ 'PubChemFP334': ('[#6](~[#6])(~[#6])(~[#6])(~[#6])', 0),
189
+ 'PubChemFP335': ('[#6H1](~[#6])(~[#6])(~[#6])', 0),
190
+ 'PubChemFP336': ('[#6](~[#6])(~[#6])(~[#6])(~[#7])', 0),
191
+ 'PubChemFP337': ('[#6](~[#6])(~[#6])(~[#6])(~[#8])', 0),
192
+ 'PubChemFP338': ('[#6H1](~[#6])(~[#6])(~[#7])', 0),
193
+ 'PubChemFP339': ('[#6H1](~[#6])(~[#6])(~[#8])', 0),
194
+ 'PubChemFP340': ('[#6](~[#6])(~[#6])(~[#7])', 0),
195
+ 'PubChemFP341': ('[#6](~[#6])(~[#6])(~[#8])', 0),
196
+ 'PubChemFP342': ('[#6](~[#6])(~[Cl])', 0),
197
+ 'PubChemFP343': ('[#6&!H0](~[#6])(~[Cl])', 0),
198
+ 'PubChemFP344': ('[#6H,#6H2,#6H3,#6H4]~[#6]', 0),
199
+ 'PubChemFP345': ('[#6&!H0](~[#6])(~[#7])', 0),
200
+ 'PubChemFP346': ('[#6&!H0](~[#6])(~[#8])', 0),
201
+ 'PubChemFP347': ('[#6H1](~[#6])(~[#8])(~[#8])', 0),
202
+ 'PubChemFP348': ('[#6&!H0](~[#6])(~[#15])', 0),
203
+ 'PubChemFP349': ('[#6&!H0](~[#6])(~[#16])', 0),
204
+ 'PubChemFP350': ('[#6](~[#6])(~[I])', 0),
205
+ 'PubChemFP351': ('[#6](~[#6])(~[#7])', 0),
206
+ 'PubChemFP352': ('[#6](~[#6])(~[#8])', 0),
207
+ 'PubChemFP353': ('[#6](~[#6])(~[#16])', 0),
208
+ 'PubChemFP354': ('[#6](~[#6])(~[#14])', 0),
209
+ 'PubChemFP355': ('[#6](~[#6])(:c)', 0),
210
+ 'PubChemFP356': ('[#6](~[#6])(:c)(:c)', 0),
211
+ 'PubChemFP357': ('[#6](~[#6])(:c)(:n)', 0),
212
+ 'PubChemFP358': ('[#6](~[#6])(:n)', 0),
213
+ 'PubChemFP359': ('[#6](~[#6])(:n)(:n)', 0),
214
+ 'PubChemFP360': ('[#6](~[Cl])(~[Cl])', 0),
215
+ 'PubChemFP361': ('[#6&!H0](~[Cl])', 0),
216
+ 'PubChemFP362': ('[#6](~[Cl])(:c)', 0),
217
+ 'PubChemFP363': ('[#6](~[F])(~[F])', 0),
218
+ 'PubChemFP364': ('[#6](~[F])(:c)', 0),
219
+ 'PubChemFP365': ('[#6&!H0](~[#7])', 0),
220
+ 'PubChemFP366': ('[#6&!H0](~[#8])', 0),
221
+ 'PubChemFP367': ('[#6&!H0](~[#8])(~[#8])', 0),
222
+ 'PubChemFP368': ('[#6&!H0](~[#16])', 0),
223
+ 'PubChemFP369': ('[#6&!H0](~[#14])', 0),
224
+ 'PubChemFP370': ('[#6&!H0]:c', 0),
225
+ 'PubChemFP371': ('[#6&!H0](:c)(:c)', 0),
226
+ 'PubChemFP372': ('[#6&!H0](:c)(:n)', 0),
227
+ 'PubChemFP373': ('[#6&!H0](:n)', 0),
228
+ 'PubChemFP374': ('[#6H3]', 0),
229
+ 'PubChemFP375': ('[#6](~[#7])(~[#7])', 0),
230
+ 'PubChemFP376': ('[#6](~[#7])(:c)', 0),
231
+ 'PubChemFP377': ('[#6](~[#7])(:c)(:c)', 0),
232
+ 'PubChemFP378': ('[#6](~[#7])(:c)(:n)', 0),
233
+ 'PubChemFP379': ('[#6](~[#7])(:n)', 0),
234
+ 'PubChemFP380': ('[#6](~[#8])(~[#8])', 0),
235
+ 'PubChemFP381': ('[#6](~[#8])(:c)', 0),
236
+ 'PubChemFP382': ('[#6](~[#8])(:c)(:c)', 0),
237
+ 'PubChemFP383': ('[#6](~[#16])(:c)', 0),
238
+ 'PubChemFP384': ('[#6](:c)(:c)', 0),
239
+ 'PubChemFP385': ('[#6](:c)(:c)(:c)', 0),
240
+ 'PubChemFP386': ('[#6](:c)(:c)(:n)', 0),
241
+ 'PubChemFP387': ('[#6](:c)(:n)', 0),
242
+ 'PubChemFP388': ('[#6](:c)(:n)(:n)', 0),
243
+ 'PubChemFP389': ('[#6](:n)(:n)', 0),
244
+ 'PubChemFP390': ('[#7](~[#6])(~[#6])', 0),
245
+ 'PubChemFP391': ('[#7](~[#6])(~[#6])(~[#6])', 0),
246
+ 'PubChemFP392': ('[#7&!H0](~[#6])(~[#6])', 0),
247
+ 'PubChemFP393': ('[#7&!H0](~[#6])', 0),
248
+ 'PubChemFP394': ('[#7&!H0](~[#6])(~[#7])', 0),
249
+ 'PubChemFP395': ('[#7](~[#6])(~[#8])', 0),
250
+ 'PubChemFP396': ('[#7](~[#6])(:c)', 0),
251
+ 'PubChemFP397': ('[#7](~[#6])(:c)(:c)', 0),
252
+ 'PubChemFP398': ('[#7&!H0](~[#7])', 0),
253
+ 'PubChemFP399': ('[#7&!H0](:c)', 0),
254
+ 'PubChemFP400': ('[#7&!H0](:c)(:c)', 0),
255
+ 'PubChemFP401': ('[#7](~[#8])(~[#8])', 0),
256
+ 'PubChemFP402': ('[#7](~[#8])(:o)', 0),
257
+ 'PubChemFP403': ('[#7](:c)(:c)', 0),
258
+ 'PubChemFP404': ('[#7](:c)(:c)(:c)', 0),
259
+ 'PubChemFP405': ('[#8](~[#6])(~[#6])', 0),
260
+ 'PubChemFP406': ('[#8&!H0](~[#6])', 0),
261
+ 'PubChemFP407': ('[#8](~[#6])(~[#15])', 0),
262
+ 'PubChemFP408': ('[#8&!H0](~[#16])', 0),
263
+ 'PubChemFP409': ('[#8](:c)(:c)', 0),
264
+ 'PubChemFP410': ('[#15](~[#6])(~[#6])', 0),
265
+ 'PubChemFP411': ('[#15](~[#8])(~[#8])', 0),
266
+ 'PubChemFP412': ('[#16](~[#6])(~[#6])', 0),
267
+ 'PubChemFP413': ('[#16&!H0](~[#6])', 0),
268
+ 'PubChemFP414': ('[#16](~[#6])(~[#8])', 0),
269
+ 'PubChemFP415': ('[#14](~[#6])(~[#6])', 0),
270
+ 'PubChemFP416': ('[#6]=,:[#6]', 0),
271
+ 'PubChemFP417': ('[#6]#[#6]', 0),
272
+ 'PubChemFP418': ('[#6]=,:[#7]', 0),
273
+ 'PubChemFP419': ('[#6]#[#7]', 0),
274
+ 'PubChemFP420': ('[#6]=,:[#8]', 0),
275
+ 'PubChemFP421': ('[#6]=,:[#16]', 0),
276
+ 'PubChemFP422': ('[#7]=,:[#7]', 0),
277
+ 'PubChemFP423': ('[#7]=,:[#8]', 0),
278
+ 'PubChemFP424': ('[#7]=,:[#15]', 0),
279
+ 'PubChemFP425': ('[#15]=,:[#8]', 0),
280
+ 'PubChemFP426': ('[#15]=,:[#15]', 0),
281
+ 'PubChemFP427': ('[#6](#[#6])(-,:[#6])', 0),
282
+ 'PubChemFP428': ('[#6&!H0](#[#6])', 0),
283
+ 'PubChemFP429': ('[#6](#[#7])(-,:[#6])', 0),
284
+ 'PubChemFP430': ('[#6](-,:[#6])(-,:[#6])(=,:[#6])', 0),
285
+ 'PubChemFP431': ('[#6](-,:[#6])(-,:[#6])(=,:[#7])', 0),
286
+ 'PubChemFP432': ('[#6](-,:[#6])(-,:[#6])(=,:[#8])', 0),
287
+ 'PubChemFP433': ('[#6](-,:[#6])([Cl])(=,:[#8])', 0),
288
+ 'PubChemFP434': ('[#6&!H0](-,:[#6])(=,:[#6])', 0),
289
+ 'PubChemFP435': ('[#6&!H0](-,:[#6])(=,:[#7])', 0),
290
+ 'PubChemFP436': ('[#6&!H0](-,:[#6])(=,:[#8])', 0),
291
+ 'PubChemFP437': ('[#6](-,:[#6])(-,:[#7])(=,:[#6])', 0),
292
+ 'PubChemFP438': ('[#6](-,:[#6])(-,:[#7])(=,:[#7])', 0),
293
+ 'PubChemFP439': ('[#6](-,:[#6])(-,:[#7])(=,:[#8])', 0),
294
+ 'PubChemFP440': ('[#6](-,:[#6])(-,:[#8])(=,:[#8])', 0),
295
+ 'PubChemFP441': ('[#6](-,:[#6])(=,:[#6])', 0),
296
+ 'PubChemFP442': ('[#6](-,:[#6])(=,:[#7])', 0),
297
+ 'PubChemFP443': ('[#6](-,:[#6])(=,:[#8])', 0),
298
+ 'PubChemFP444': ('[#6]([Cl])(=,:[#8])', 0),
299
+ 'PubChemFP445': ('[#6&!H0](-,:[#7])(=,:[#6])', 0),
300
+ 'PubChemFP446': ('[#6&!H0](=,:[#6])', 0),
301
+ 'PubChemFP447': ('[#6&!H0](=,:[#7])', 0),
302
+ 'PubChemFP448': ('[#6&!H0](=,:[#8])', 0),
303
+ 'PubChemFP449': ('[#6](-,:[#7])(=,:[#6])', 0),
304
+ 'PubChemFP450': ('[#6](-,:[#7])(=,:[#7])', 0),
305
+ 'PubChemFP451': ('[#6](-,:[#7])(=,:[#8])', 0),
306
+ 'PubChemFP452': ('[#6](-,:[#8])(=,:[#8])', 0),
307
+ 'PubChemFP453': ('[#7](-,:[#6])(=,:[#6])', 0),
308
+ 'PubChemFP454': ('[#7](-,:[#6])(=,:[#8])', 0),
309
+ 'PubChemFP455': ('[#7](-,:[#8])(=,:[#8])', 0),
310
+ 'PubChemFP456': ('[#15](-,:[#8])(=,:[#8])', 0),
311
+ 'PubChemFP457': ('[#16](-,:[#6])(=,:[#8])', 0),
312
+ 'PubChemFP458': ('[#16](-,:[#8])(=,:[#8])', 0),
313
+ 'PubChemFP459': ('[#16](=,:[#8])(=,:[#8])', 0),
314
+ 'PubChemFP460': ('[#6]-,:[#6]-,:[#6]#[#6]', 0),
315
+ 'PubChemFP461': ('[#8]-,:[#6]-,:[#6]=,:[#7]', 0),
316
+ 'PubChemFP462': ('[#8]-,:[#6]-,:[#6]=,:[#8]', 0),
317
+ 'PubChemFP463': ('[#7]:[#6]-,:[#16&!H0]', 0),
318
+ 'PubChemFP464': ('[#7]-,:[#6]-,:[#6]=,:[#6]', 0),
319
+ 'PubChemFP465': ('[#8]=,:[#16]-,:[#6]-,:[#6]', 0),
320
+ 'PubChemFP466': ('[#7]#[#6]-,:[#6]=,:[#6]', 0),
321
+ 'PubChemFP467': ('[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
322
+ 'PubChemFP468': ('[#8]=,:[#16]-,:[#6]-,:[#7]', 0),
323
+ 'PubChemFP469': ('[#16]-,:[#16]-,:[#6]:[#6]', 0),
324
+ 'PubChemFP470': ('[#6]:[#6]-,:[#6]=,:[#6]', 0),
325
+ 'PubChemFP471': ('[#16]:[#6]:[#6]:[#6]', 0),
326
+ 'PubChemFP472': ('[#6]:[#7]:[#6]-,:[#6]', 0),
327
+ 'PubChemFP473': ('[#16]-,:[#6]:[#7]:[#6]', 0),
328
+ 'PubChemFP474': ('[#16]:[#6]:[#6]:[#7]', 0),
329
+ 'PubChemFP475': ('[#16]-,:[#6]=,:[#7]-,:[#6]', 0),
330
+ 'PubChemFP476': ('[#6]-,:[#8]-,:[#6]=,:[#6]', 0),
331
+ 'PubChemFP477': ('[#7]-,:[#7]-,:[#6]:[#6]', 0),
332
+ 'PubChemFP478': ('[#16]-,:[#6]=,:[#7&!H0]', 0),
333
+ 'PubChemFP479': ('[#16]-,:[#6]-,:[#16]-,:[#6]', 0),
334
+ 'PubChemFP480': ('[#6]:[#16]:[#6]-,:[#6]', 0),
335
+ 'PubChemFP481': ('[#8]-,:[#16]-,:[#6]:[#6]', 0),
336
+ 'PubChemFP482': ('[#6]:[#7]-,:[#6]:[#6]', 0),
337
+ 'PubChemFP483': ('[#7]-,:[#16]-,:[#6]:[#6]', 0),
338
+ 'PubChemFP484': ('[#7]-,:[#6]:[#7]:[#6]', 0),
339
+ 'PubChemFP485': ('[#7]:[#6]:[#6]:[#7]', 0),
340
+ 'PubChemFP486': ('[#7]-,:[#6]:[#7]:[#7]', 0),
341
+ 'PubChemFP487': ('[#7]-,:[#6]=,:[#7]-,:[#6]', 0),
342
+ 'PubChemFP488': ('[#7]-,:[#6]=,:[#7&!H0]', 0),
343
+ 'PubChemFP489': ('[#7]-,:[#6]-,:[#16]-,:[#6]', 0),
344
+ 'PubChemFP490': ('[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
345
+ 'PubChemFP491': ('[#6]-,:[#7]:[#6&!H0]', 0),
346
+ 'PubChemFP492': ('[#7]-,:[#6]:[#8]:[#6]', 0),
347
+ 'PubChemFP493': ('[#8]=,:[#6]-,:[#6]:[#6]', 0),
348
+ 'PubChemFP494': ('[#8]=,:[#6]-,:[#6]:[#7]', 0),
349
+ 'PubChemFP495': ('[#6]-,:[#7]-,:[#6]:[#6]', 0),
350
+ 'PubChemFP496': ('[#7]:[#7]-,:[#6&!H0]', 0),
351
+ 'PubChemFP497': ('[#8]-,:[#6]:[#6]:[#7]', 0),
352
+ 'PubChemFP498': ('[#8]-,:[#6]=,:[#6]-,:[#6]', 0),
353
+ 'PubChemFP499': ('[#7]-,:[#6]:[#6]:[#7]', 0),
354
+ 'PubChemFP500': ('[#6]-,:[#16]-,:[#6]:[#6]', 0),
355
+ 'PubChemFP501': ('[Cl]-,:[#6]:[#6]-,:[#6]', 0),
356
+ 'PubChemFP502': ('[#7]-,:[#6]=,:[#6&!H0]', 0),
357
+ 'PubChemFP503': ('[Cl]-,:[#6]:[#6&!H0]', 0),
358
+ 'PubChemFP504': ('[#7]:[#6]:[#7]-,:[#6]', 0),
359
+ 'PubChemFP505': ('[Cl]-,:[#6]:[#6]-,:[#8]', 0),
360
+ 'PubChemFP506': ('[#6]-,:[#6]:[#7]:[#6]', 0),
361
+ 'PubChemFP507': ('[#6]-,:[#6]-,:[#16]-,:[#6]', 0),
362
+ 'PubChemFP508': ('[#16]=,:[#6]-,:[#7]-,:[#6]', 0),
363
+ 'PubChemFP509': ('[Br]-,:[#6]:[#6]-,:[#6]', 0),
364
+ 'PubChemFP510': ('[#7&!H0]-,:[#7&!H0]', 0),
365
+ 'PubChemFP511': ('[#16]=,:[#6]-,:[#7&!H0]', 0),
366
+ 'PubChemFP512': ('[#6]-,:[#33]-[#8&!H0]', 0),
367
+ 'PubChemFP513': ('[#16]:[#6]:[#6&!H0]', 0),
368
+ 'PubChemFP514': ('[#8]-,:[#7]-,:[#6]-,:[#6]', 0),
369
+ 'PubChemFP515': ('[#7]-,:[#7]-,:[#6]-,:[#6]', 0),
370
+ 'PubChemFP516': ('[#6H,#6H2,#6H3]=,:[#6H,#6H2,#6H3]', 0),
371
+ 'PubChemFP517': ('[#7]-,:[#7]-,:[#6]-,:[#7]', 0),
372
+ 'PubChemFP518': ('[#8]=,:[#6]-,:[#7]-,:[#7]', 0),
373
+ 'PubChemFP519': ('[#7]=,:[#6]-,:[#7]-,:[#6]', 0),
374
+ 'PubChemFP520': ('[#6]=,:[#6]-,:[#6]:[#6]', 0),
375
+ 'PubChemFP521': ('[#6]:[#7]-,:[#6&!H0]', 0),
376
+ 'PubChemFP522': ('[#6]-,:[#7]-,:[#7&!H0]', 0),
377
+ 'PubChemFP523': ('[#7]:[#6]:[#6]-,:[#6]', 0),
378
+ 'PubChemFP524': ('[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
379
+ 'PubChemFP525': ('[#33]-,:[#6]:[#6&!H0]', 0),
380
+ 'PubChemFP526': ('[Cl]-,:[#6]:[#6]-,:[Cl]', 0),
381
+ 'PubChemFP527': ('[#6]:[#6]:[#7&!H0]', 0),
382
+ 'PubChemFP528': ('[#7&!H0]-,:[#6&!H0]', 0),
383
+ 'PubChemFP529': ('[Cl]-,:[#6]-,:[#6]-,:[Cl]', 0),
384
+ 'PubChemFP530': ('[#7]:[#6]-,:[#6]:[#6]', 0),
385
+ 'PubChemFP531': ('[#16]-,:[#6]:[#6]-,:[#6]', 0),
386
+ 'PubChemFP532': ('[#16]-,:[#6]:[#6&!H0]', 0),
387
+ 'PubChemFP533': ('[#16]-,:[#6]:[#6]-,:[#7]', 0),
388
+ 'PubChemFP534': ('[#16]-,:[#6]:[#6]-,:[#8]', 0),
389
+ 'PubChemFP535': ('[#8]=,:[#6]-,:[#6]-,:[#6]', 0),
390
+ 'PubChemFP536': ('[#8]=,:[#6]-,:[#6]-,:[#7]', 0),
391
+ 'PubChemFP537': ('[#8]=,:[#6]-,:[#6]-,:[#8]', 0),
392
+ 'PubChemFP538': ('[#7]=,:[#6]-,:[#6]-,:[#6]', 0),
393
+ 'PubChemFP539': ('[#7]=,:[#6]-,:[#6&!H0]', 0),
394
+ 'PubChemFP540': ('[#6]-,:[#7]-,:[#6&!H0]', 0),
395
+ 'PubChemFP541': ('[#8]-,:[#6]:[#6]-,:[#6]', 0),
396
+ 'PubChemFP542': ('[#8]-,:[#6]:[#6&!H0]', 0),
397
+ 'PubChemFP543': ('[#8]-,:[#6]:[#6]-,:[#7]', 0),
398
+ 'PubChemFP544': ('[#8]-,:[#6]:[#6]-,:[#8]', 0),
399
+ 'PubChemFP545': ('[#7]-,:[#6]:[#6]-,:[#6]', 0),
400
+ 'PubChemFP546': ('[#7]-,:[#6]:[#6&!H0]', 0),
401
+ 'PubChemFP547': ('[#7]-,:[#6]:[#6]-,:[#7]', 0),
402
+ 'PubChemFP548': ('[#8]-,:[#6]-,:[#6]:[#6]', 0),
403
+ 'PubChemFP549': ('[#7]-,:[#6]-,:[#6]:[#6]', 0),
404
+ 'PubChemFP550': ('[Cl]-,:[#6]-,:[#6]-,:[#6]', 0),
405
+ 'PubChemFP551': ('[Cl]-,:[#6]-,:[#6]-,:[#8]', 0),
406
+ 'PubChemFP552': ('[#6]:[#6]-,:[#6]:[#6]', 0),
407
+ 'PubChemFP553': ('[#8]=,:[#6]-,:[#6]=,:[#6]', 0),
408
+ 'PubChemFP554': ('[Br]-,:[#6]-,:[#6]-,:[#6]', 0),
409
+ 'PubChemFP555': ('[#7]=,:[#6]-,:[#6]=,:[#6]', 0),
410
+ 'PubChemFP556': ('[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
411
+ 'PubChemFP557': ('[#7]:[#6]-,:[#8&!H0]', 0),
412
+ 'PubChemFP558': ('[#8]=,:[#7]-,:c:c', 0),
413
+ 'PubChemFP559': ('[#8]-,:[#6]-,:[#7&!H0]', 0),
414
+ 'PubChemFP560': ('[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
415
+ 'PubChemFP561': ('[Cl]-,:[#6]-,:[#6]=,:[#8]', 0),
416
+ 'PubChemFP562': ('[Br]-,:[#6]-,:[#6]=,:[#8]', 0),
417
+ 'PubChemFP563': ('[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
418
+ 'PubChemFP564': ('[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
419
+ 'PubChemFP565': ('[#6]:[#6]-,:[#8]-,:[#6]', 0),
420
+ 'PubChemFP566': ('[#8]-,:[#6]-,:[#6]-,:[#7]', 0),
421
+ 'PubChemFP567': ('[#8]-,:[#6]-,:[#6]-,:[#8]', 0),
422
+ 'PubChemFP568': ('N#[#6]-,:[#6]-,:[#6]', 0),
423
+ 'PubChemFP569': ('[#7]-,:[#6]-,:[#6]-,:[#7]', 0),
424
+ 'PubChemFP570': ('[#6]:[#6]-,:[#6]-,:[#6]', 0),
425
+ 'PubChemFP571': ('[#6&!H0]-,:[#8&!H0]', 0),
426
+ 'PubChemFP572': ('n:c:n:c', 0),
427
+ 'PubChemFP573': ('[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
428
+ 'PubChemFP574': ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
429
+ 'PubChemFP575': ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
430
+ 'PubChemFP576': ('[#7]=,:[#6]-,:[#6]:[#6&!H0]', 0),
431
+ 'PubChemFP577': ('c:c-,:[#7]-,:c:c', 0),
432
+ 'PubChemFP578': ('[#6]-,:[#6]:[#6]-,:c:c', 0),
433
+ 'PubChemFP579': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
434
+ 'PubChemFP580': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
435
+ 'PubChemFP581': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
436
+ 'PubChemFP582': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
437
+ 'PubChemFP583': ('[Cl]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
438
+ 'PubChemFP584': ('c:c-,:[#6]=,:[#6]-,:[#6]', 0),
439
+ 'PubChemFP585': ('[#6]-,:[#6]:[#6]-,:[#7]-,:[#6]', 0),
440
+ 'PubChemFP586': ('[#6]-,:[#16]-,:[#6]-,:[#6]-,:[#6]', 0),
441
+ 'PubChemFP587': ('[#7]-,:[#6]:[#6]-,:[#8&!H0]', 0),
442
+ 'PubChemFP588': ('[#8]=,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
443
+ 'PubChemFP589': ('[#6]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
444
+ 'PubChemFP590': ('[#6]-,:[#6]:[#6]-,:[#8&!H0]', 0),
445
+ 'PubChemFP591': ('[Cl]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
446
+ 'PubChemFP592': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
447
+ 'PubChemFP593': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
448
+ 'PubChemFP594': ('[#6]-,:[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
449
+ 'PubChemFP595': ('c:c-,:[#6]-,:[#6]-,:[#6]', 0),
450
+ 'PubChemFP596': ('[#7]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
451
+ 'PubChemFP597': ('[#8]=,:[#6]-,:[#6]-,:c:c', 0),
452
+ 'PubChemFP598': ('[Cl]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
453
+ 'PubChemFP599': ('[#6H,#6H2,#6H3]-,:[#6]=,:[#6H,#6H2,#6H3]', 0),
454
+ 'PubChemFP600': ('[#7]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
455
+ 'PubChemFP601': ('[#7]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
456
+ 'PubChemFP602': ('[#8]=,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
457
+ 'PubChemFP603': ('[#6]-,:c:c:[#6]-,:[#6]', 0),
458
+ 'PubChemFP604': ('[#6]-,:[#8]-,:[#6]-,:[#6]:c', 0),
459
+ 'PubChemFP605': ('[#8]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
460
+ 'PubChemFP606': ('[#8]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
461
+ 'PubChemFP607': ('[#7]-,:[#6]-,:[#6]-,:[#6]:c', 0),
462
+ 'PubChemFP608': ('[#6]-,:[#6]-,:[#6]-,:[#6]:c', 0),
463
+ 'PubChemFP609': ('[Cl]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
464
+ 'PubChemFP610': ('[#6]-,:[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
465
+ 'PubChemFP611': ('[#7]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
466
+ 'PubChemFP612': ('[#7]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
467
+ 'PubChemFP613': ('[#6]-,:[#7]-,:[#6]-,:[#6]-,:[#6]', 0),
468
+ 'PubChemFP614': ('[#6]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
469
+ 'PubChemFP615': ('[#7]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
470
+ 'PubChemFP616': ('c:c:n:n:c', 0),
471
+ 'PubChemFP617': ('[#6]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
472
+ 'PubChemFP618': ('c:[#6]-,:[#6]-,:[#6]:c', 0),
473
+ 'PubChemFP619': ('[#8]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
474
+ 'PubChemFP620': ('c:c-,:[#8]-,:[#6]-,:[#6]', 0),
475
+ 'PubChemFP621': ('[#7]-,:[#6]:c:c:n', 0),
476
+ 'PubChemFP622': ('[#8]=,:[#6]-,:[#8]-,:[#6]:c', 0),
477
+ 'PubChemFP623': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
478
+ 'PubChemFP624': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#7]', 0),
479
+ 'PubChemFP625': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
480
+ 'PubChemFP626': ('[#6]-,:[#8]-,:[#6]:[#6]-,:[#6]', 0),
481
+ 'PubChemFP627': ('[#8]=,:[#33]-,:[#6]:c:c', 0),
482
+ 'PubChemFP628': ('[#6]-,:[#7]-,:[#6]-,:[#6]:c', 0),
483
+ 'PubChemFP629': ('[#16]-,:[#6]:c:c-,:[#7]', 0),
484
+ 'PubChemFP630': ('[#8]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
485
+ 'PubChemFP631': ('[#8]-,:[#6]:[#6]-,:[#8&!H0]', 0),
486
+ 'PubChemFP632': ('[#6]-,:[#6]-,:[#8]-,:[#6]:c', 0),
487
+ 'PubChemFP633': ('[#7]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
488
+ 'PubChemFP634': ('[#6]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
489
+ 'PubChemFP635': ('[#7]-,:[#7]-,:[#6]-,:[#7&!H0]', 0),
490
+ 'PubChemFP636': ('[#6]-,:[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
491
+ 'PubChemFP637': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
492
+ 'PubChemFP638': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
493
+ 'PubChemFP639': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
494
+ 'PubChemFP640': ('[#6]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
495
+ 'PubChemFP641': ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
496
+ 'PubChemFP642': ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
497
+ 'PubChemFP643': ('[#6&!H0]-,:[#6]-,:[#7&!H0]', 0),
498
+ 'PubChemFP644': ('[#6]-,:[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
499
+ 'PubChemFP645': ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
500
+ 'PubChemFP646': ('[#8]=,:[#6]-,:[#7]-,:[#6&!H0]', 0),
501
+ 'PubChemFP647': ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#7]', 0),
502
+ 'PubChemFP648': ('[#8]=,:[#7]-,:[#6]:[#6]-,:[#7]', 0),
503
+ 'PubChemFP649': ('[#8]=,:[#7]-,:c:c-,:[#8]', 0),
504
+ 'PubChemFP650': ('[#8]=,:[#6]-,:[#7]-,:[#6]=,:[#8]', 0),
505
+ 'PubChemFP651': ('[#8]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
506
+ 'PubChemFP652': ('[#8]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
507
+ 'PubChemFP653': ('[#8]-,:[#6]:[#6]:[#6]-,:[#8]', 0),
508
+ 'PubChemFP654': ('[#7]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
509
+ 'PubChemFP655': ('[#8]-,:[#6]-,:[#6]-,:[#6]:c', 0),
510
+ 'PubChemFP656': ('[#6]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
511
+ 'PubChemFP657': ('[#6]-,:[#7]-,:[#6]:[#6]-,:[#6]', 0),
512
+ 'PubChemFP658': ('[#6]-,:[#6]-,:[#16]-,:[#6]-,:[#6]', 0),
513
+ 'PubChemFP659': ('[#8]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
514
+ 'PubChemFP660': ('[#6]-,:[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
515
+ 'PubChemFP661': ('[#8]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
516
+ 'PubChemFP662': ('[#8]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
517
+ 'PubChemFP663': ('[#8]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
518
+ 'PubChemFP664': ('[#6]-,:[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
519
+ 'PubChemFP665': ('[#7]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
520
+ 'PubChemFP666': ('[#6]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
521
+ 'PubChemFP667': ('[#6]=,:[#6]-,:[#6]-,:[#8&!H0]', 0),
522
+ 'PubChemFP668': ('[#6]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
523
+ 'PubChemFP669': ('[Cl]-,:[#6]:[#6]-,:[#6]=,:[#8]', 0),
524
+ 'PubChemFP670': ('[Br]-,:[#6]:c:c-,:[#6]', 0),
525
+ 'PubChemFP671': ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
526
+ 'PubChemFP672': ('[#8]=,:[#6]-,:[#6]=,:[#6&!H0]', 0),
527
+ 'PubChemFP673': ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#7]', 0),
528
+ 'PubChemFP674': ('[#7]-,:[#6]-,:[#7]-,:[#6]:c', 0),
529
+ 'PubChemFP675': ('[Br]-,:[#6]-,:[#6]-,:[#6]:c', 0),
530
+ 'PubChemFP676': ('[#7]#[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
531
+ 'PubChemFP677': ('[#6]-,:[#6]=,:[#6]-,:[#6]:c', 0),
532
+ 'PubChemFP678': ('[#6]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
533
+ 'PubChemFP679': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
534
+ 'PubChemFP680': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
535
+ 'PubChemFP681': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
536
+ 'PubChemFP682': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
537
+ 'PubChemFP683': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
538
+ 'PubChemFP684': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
539
+ 'PubChemFP685': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
540
+ 'PubChemFP686': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
541
+ 'PubChemFP687': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
542
+ 'PubChemFP688': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
543
+ 'PubChemFP689': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
544
+ 'PubChemFP690': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
545
+ 'PubChemFP691': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
546
+ 'PubChemFP692': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
547
+ 'PubChemFP693': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
548
+ 'PubChemFP694': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
549
+ 'PubChemFP695': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
550
+ 'PubChemFP696': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
551
+ 'PubChemFP697': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
552
+ 'PubChemFP698': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
553
+ 'PubChemFP699': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
554
+ 'PubChemFP700': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
555
+ 'PubChemFP701': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
556
+ 'PubChemFP702': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
557
+ 'PubChemFP703': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
558
+ 'PubChemFP704': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
559
+ 'PubChemFP705': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
560
+ 'PubChemFP706': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](=,:[#8])-,:[#6]', 0),
561
+ 'PubChemFP707': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
562
+ 'PubChemFP708': ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
563
+ 'PubChemFP709': ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]-,:[#6]', 0),
564
+ 'PubChemFP710': ('[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
565
+ 'PubChemFP711': ('[#6]-,:[#6](-,:[#6])(-,:[#6])-,:[#6]-,:[#6]', 0),
566
+ 'PubChemFP712': ('[#6]-,:[#6](-,:[#6])-,:[#6](-,:[#6])-,:[#6]', 0),
567
+ 'PubChemFP713': ('[#6]c1ccc([#6])cc1', 0),
568
+ 'PubChemFP714': ('[#6]c1ccc([#8])cc1', 0),
569
+ 'PubChemFP715': ('[#6]c1ccc([#16])cc1', 0),
570
+ 'PubChemFP716': ('[#6]c1ccc([#7])cc1', 0),
571
+ 'PubChemFP717': ('[#6]c1ccc(Cl)cc1', 0),
572
+ 'PubChemFP718': ('[#6]c1ccc(Br)cc1', 0),
573
+ 'PubChemFP719': ('[#8]c1ccc([#8])cc1', 0),
574
+ 'PubChemFP720': ('[#8]c1ccc([#16])cc1', 0),
575
+ 'PubChemFP721': ('[#8]c1ccc([#7])cc1', 0),
576
+ 'PubChemFP722': ('[#8]c1ccc(Cl)cc1', 0),
577
+ 'PubChemFP723': ('[#8]c1ccc(Br)cc1', 0),
578
+ 'PubChemFP724': ('[#16]c1ccc([#16])cc1', 0),
579
+ 'PubChemFP725': ('[#16]c1ccc([#7])cc1', 0),
580
+ 'PubChemFP726': ('[#16]c1ccc(Cl)cc1', 0),
581
+ 'PubChemFP727': ('[#16]c1ccc(Br)cc1', 0),
582
+ 'PubChemFP728': ('[#7]c1ccc([#7])cc1', 0),
583
+ 'PubChemFP729': ('[#7]c1ccc(Cl)cc1', 0),
584
+ 'PubChemFP730': ('[#7]c1ccc(Br)cc1', 0),
585
+ 'PubChemFP731': ('Clc1ccc(Cl)cc1', 0),
586
+ 'PubChemFP732': ('Clc1ccc(Br)cc1', 0),
587
+ 'PubChemFP733': ('Brc1ccc(Br)cc1', 0),
588
+ 'PubChemFP734': ('[#6]c1cc([#6])ccc1', 0),
589
+ 'PubChemFP735': ('[#6]c1cc([#8])ccc1', 0),
590
+ 'PubChemFP736': ('[#6]c1cc([#16])ccc1', 0),
591
+ 'PubChemFP737': ('[#6]c1cc([#7])ccc1', 0),
592
+ 'PubChemFP738': ('[#6]c1cc(Cl)ccc1', 0),
593
+ 'PubChemFP739': ('[#6]c1cc(Br)ccc1', 0),
594
+ 'PubChemFP740': ('[#8]c1cc([#8])ccc1', 0),
595
+ 'PubChemFP741': ('[#8]c1cc([#16])ccc1', 0),
596
+ 'PubChemFP742': ('[#8]c1cc([#7])ccc1', 0),
597
+ 'PubChemFP743': ('[#8]c1cc(Cl)ccc1', 0),
598
+ 'PubChemFP744': ('[#8]c1cc(Br)ccc1', 0),
599
+ 'PubChemFP745': ('[#16]c1cc([#16])ccc1', 0),
600
+ 'PubChemFP746': ('[#16]c1cc([#7])ccc1', 0),
601
+ 'PubChemFP747': ('[#16]c1cc(Cl)ccc1', 0),
602
+ 'PubChemFP748': ('[#16]c1cc(Br)ccc1', 0),
603
+ 'PubChemFP749': ('[#7]c1cc([#7])ccc1', 0),
604
+ 'PubChemFP750': ('[#7]c1cc(Cl)ccc1', 0),
605
+ 'PubChemFP751': ('[#7]c1cc(Br)ccc1', 0),
606
+ 'PubChemFP752': ('Clc1cc(Cl)ccc1', 0),
607
+ 'PubChemFP753': ('Clc1cc(Br)ccc1', 0),
608
+ 'PubChemFP754': ('Brc1cc(Br)ccc1', 0),
609
+ 'PubChemFP755': ('[#6]c1c([#6])cccc1', 0),
610
+ 'PubChemFP756': ('[#6]c1c([#8])cccc1', 0),
611
+ 'PubChemFP757': ('[#6]c1c([#16])cccc1', 0),
612
+ 'PubChemFP758': ('[#6]c1c([#7])cccc1', 0),
613
+ 'PubChemFP759': ('[#6]c1c(Cl)cccc1', 0),
614
+ 'PubChemFP760': ('[#6]c1c(Br)cccc1', 0),
615
+ 'PubChemFP761': ('[#8]c1c([#8])cccc1', 0),
616
+ 'PubChemFP762': ('[#8]c1c([#16])cccc1', 0),
617
+ 'PubChemFP763': ('[#8]c1c([#7])cccc1', 0),
618
+ 'PubChemFP764': ('[#8]c1c(Cl)cccc1', 0),
619
+ 'PubChemFP765': ('[#8]c1c(Br)cccc1', 0),
620
+ 'PubChemFP766': ('[#16]c1c([#16])cccc1', 0),
621
+ 'PubChemFP767': ('[#16]c1c([#7])cccc1', 0),
622
+ 'PubChemFP768': ('[#16]c1c(Cl)cccc1', 0),
623
+ 'PubChemFP769': ('[#16]c1c(Br)cccc1', 0),
624
+ 'PubChemFP770': ('[#7]c1c([#7])cccc1', 0),
625
+ 'PubChemFP771': ('[#7]c1c(Cl)cccc1', 0),
626
+ 'PubChemFP772': ('[#7]c1c(Br)cccc1', 0),
627
+ 'PubChemFP773': ('Clc1c(Cl)cccc1', 0),
628
+ 'PubChemFP774': ('Clc1c(Br)cccc1', 0),
629
+ 'PubChemFP775': ('Brc1c(Br)cccc1', 0),
630
+ 'PubChemFP776': ('[#6][#6]1[#6][#6][#6]([#6])[#6][#6]1', 0),
631
+ 'PubChemFP777': ('[#6][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
632
+ 'PubChemFP778': ('[#6][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
633
+ 'PubChemFP779': ('[#6][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
634
+ 'PubChemFP780': ('[#6][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
635
+ 'PubChemFP781': ('[#6][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
636
+ 'PubChemFP782': ('[#8][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
637
+ 'PubChemFP783': ('[#8][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
638
+ 'PubChemFP784': ('[#8][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
639
+ 'PubChemFP785': ('[#8][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
640
+ 'PubChemFP786': ('[#8][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
641
+ 'PubChemFP787': ('[#16][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
642
+ 'PubChemFP788': ('[#16][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
643
+ 'PubChemFP789': ('[#16][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
644
+ 'PubChemFP790': ('[#16][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
645
+ 'PubChemFP791': ('[#7][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
646
+ 'PubChemFP792': ('[#7][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
647
+ 'PubChemFP793': ('[#7][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
648
+ 'PubChemFP794': ('Cl[#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
649
+ 'PubChemFP795': ('Cl[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
650
+ 'PubChemFP796': ('Br[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
651
+ 'PubChemFP797': ('[#6][#6]1[#6][#6]([#6])[#6][#6][#6]1', 0),
652
+ 'PubChemFP798': ('[#6][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
653
+ 'PubChemFP799': ('[#6][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
654
+ 'PubChemFP800': ('[#6][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
655
+ 'PubChemFP801': ('[#6][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
656
+ 'PubChemFP802': ('[#6][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
657
+ 'PubChemFP803': ('[#8][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
658
+ 'PubChemFP804': ('[#8][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
659
+ 'PubChemFP805': ('[#8][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
660
+ 'PubChemFP806': ('[#8][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
661
+ 'PubChemFP807': ('[#8][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
662
+ 'PubChemFP808': ('[#16][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
663
+ 'PubChemFP809': ('[#16][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
664
+ 'PubChemFP810': ('[#16][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
665
+ 'PubChemFP811': ('[#16][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
666
+ 'PubChemFP812': ('[#7][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
667
+ 'PubChemFP813': ('[#7][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
668
+ 'PubChemFP814': ('[#7][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
669
+ 'PubChemFP815': ('Cl[#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
670
+ 'PubChemFP816': ('Cl[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
671
+ 'PubChemFP817': ('Br[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
672
+ 'PubChemFP818': ('[#6][#6]1[#6]([#6])[#6][#6][#6][#6]1', 0),
673
+ 'PubChemFP819': ('[#6][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
674
+ 'PubChemFP820': ('[#6][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
675
+ 'PubChemFP821': ('[#6][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
676
+ 'PubChemFP822': ('[#6][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
677
+ 'PubChemFP823': ('[#6][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
678
+ 'PubChemFP824': ('[#8][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
679
+ 'PubChemFP825': ('[#8][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
680
+ 'PubChemFP826': ('[#8][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
681
+ 'PubChemFP827': ('[#8][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
682
+ 'PubChemFP828': ('[#8][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
683
+ 'PubChemFP829': ('[#16][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
684
+ 'PubChemFP830': ('[#16][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
685
+ 'PubChemFP831': ('[#16][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
686
+ 'PubChemFP832': ('[#16][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
687
+ 'PubChemFP833': ('[#7][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
688
+ 'PubChemFP834': ('[#7][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
689
+ 'PubChemFP835': ('[#7][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
690
+ 'PubChemFP836': ('Cl[#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
691
+ 'PubChemFP837': ('Cl[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
692
+ 'PubChemFP838': ('Br[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
693
+ 'PubChemFP839': ('[#6][#6]1[#6][#6]([#6])[#6][#6]1', 0),
694
+ 'PubChemFP840': ('[#6][#6]1[#6][#6]([#8])[#6][#6]1', 0),
695
+ 'PubChemFP841': ('[#6][#6]1[#6][#6]([#16])[#6][#6]1', 0),
696
+ 'PubChemFP842': ('[#6][#6]1[#6][#6]([#7])[#6][#6]1', 0),
697
+ 'PubChemFP843': ('[#6][#6]1[#6][#6](Cl)[#6][#6]1', 0),
698
+ 'PubChemFP844': ('[#6][#6]1[#6][#6](Br)[#6][#6]1', 0),
699
+ 'PubChemFP845': ('[#8][#6]1[#6][#6]([#8])[#6][#6]1', 0),
700
+ 'PubChemFP846': ('[#8][#6]1[#6][#6]([#16])[#6][#6]1', 0),
701
+ 'PubChemFP847': ('[#8][#6]1[#6][#6]([#7])[#6][#6]1', 0),
702
+ 'PubChemFP848': ('[#8][#6]1[#6][#6](Cl)[#6][#6]1', 0),
703
+ 'PubChemFP849': ('[#8][#6]1[#6][#6](Br)[#6][#6]1', 0),
704
+ 'PubChemFP850': ('[#16][#6]1[#6][#6]([#16])[#6][#6]1', 0),
705
+ 'PubChemFP851': ('[#16][#6]1[#6][#6]([#7])[#6][#6]1', 0),
706
+ 'PubChemFP852': ('[#16][#6]1[#6][#6](Cl)[#6][#6]1', 0),
707
+ 'PubChemFP853': ('[#16][#6]1[#6][#6](Br)[#6][#6]1', 0),
708
+ 'PubChemFP854': ('[#7][#6]1[#6][#6]([#7])[#6][#6]1', 0),
709
+ 'PubChemFP855': ('[#7][#6]1[#6][#6](Cl)[#6][#6]1', 0),
710
+ 'PubChemFP856': ('[#7][#6]1[#6][#6](Br)[#6][#6]1', 0),
711
+ 'PubChemFP857': ('Cl[#6]1[#6][#6](Cl)[#6][#6]1', 0),
712
+ 'PubChemFP858': ('Cl[#6]1[#6][#6](Br)[#6][#6]1', 0),
713
+ 'PubChemFP859': ('Br[#6]1[#6][#6](Br)[#6][#6]1', 0),
714
+ 'PubChemFP860': ('[#6][#6]1[#6]([#6])[#6][#6][#6]1', 0),
715
+ 'PubChemFP861': ('[#6][#6]1[#6]([#8])[#6][#6][#6]1', 0),
716
+ 'PubChemFP862': ('[#6][#6]1[#6]([#16])[#6][#6][#6]1', 0),
717
+ 'PubChemFP863': ('[#6][#6]1[#6]([#7])[#6][#6][#6]1', 0),
718
+ 'PubChemFP864': ('[#6][#6]1[#6](Cl)[#6][#6][#6]1', 0),
719
+ 'PubChemFP865': ('[#6][#6]1[#6](Br)[#6][#6][#6]1', 0),
720
+ 'PubChemFP866': ('[#8][#6]1[#6]([#8])[#6][#6][#6]1', 0),
721
+ 'PubChemFP867': ('[#8][#6]1[#6]([#16])[#6][#6][#6]1', 0),
722
+ 'PubChemFP868': ('[#8][#6]1[#6]([#7])[#6][#6][#6]1', 0),
723
+ 'PubChemFP869': ('[#8][#6]1[#6](Cl)[#6][#6][#6]1', 0),
724
+ 'PubChemFP870': ('[#8][#6]1[#6](Br)[#6][#6][#6]1', 0),
725
+ 'PubChemFP871': ('[#16][#6]1[#6]([#16])[#6][#6][#6]1', 0),
726
+ 'PubChemFP872': ('[#16][#6]1[#6]([#7])[#6][#6][#6]1', 0),
727
+ 'PubChemFP873': ('[#16][#6]1[#6](Cl)[#6][#6][#6]1', 0),
728
+ 'PubChemFP874': ('[#16][#6]1[#6](Br)[#6][#6][#6]1', 0),
729
+ 'PubChemFP875': ('[#7][#6]1[#6]([#7])[#6][#6][#6]1', 0),
730
+ 'PubChemFP876': ('[#7][#6]1[#6](Cl)[#6][#6][#6]1', 0),
731
+ 'PubChemFP877': ('[#7][#6]1[#6](Br)[#6][#6][#6]1', 0),
732
+ 'PubChemFP878': ('Cl[#6]1[#6](Cl)[#6][#6][#6]1', 0),
733
+ 'PubChemFP879': ('Cl[#6]1[#6](Br)[#6][#6][#6]1', 0),
734
+ 'PubChemFP880': ('Br[#6]1[#6](Br)[#6][#6][#6]1', 0)}
deepscreen/data/featurizers/fingerprint/torsions.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rdkit.Chem.AtomPairs import Torsions
2
+ from rdkit.Chem import DataStructs
3
+ import numpy as np
4
+
5
+ _type = 'topological-based'
6
+
7
+
8
def GetTorsionFPs(mol, nBits=2048, binary=True):
    """Compute a hashed topological torsion fingerprint for a molecule.

    Bug fix (docs): the original docstring said "atompairs fingerprints",
    apparently copy-pasted from the atom-pair featurizer; this function
    actually hashes topological torsions.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to featurize.
    nBits : int, default 2048
        Length of the hashed fingerprint.
    binary : bool, default True
        If True, return a boolean (presence/absence) array; otherwise an
        int8 count array.

    Returns
    -------
    numpy.ndarray of shape (nBits,)
    """
    fp = Torsions.GetHashedTopologicalTorsionFingerprint(mol, nBits=nBits)
    # ConvertToNumpyArray resizes the (0,)-shaped array in place to nBits,
    # so the array only needs the right dtype here.
    if binary:
        arr = np.zeros((0,), dtype=np.bool_)
    else:
        arr = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
deepscreen/data/featurizers/graph.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import networkx as nx
2
+ import numpy as np
3
+ import torch
4
+ from rdkit import Chem
5
+ from torch_geometric.utils import from_smiles
6
+ from torch_geometric.data import Data
7
+
8
+ from deepscreen.data.featurizers.categorical import one_of_k_encoding_unk, one_of_k_encoding
9
+ from deepscreen.utils import get_logger
10
+
11
+ log = get_logger(__name__)
12
+
13
+
14
def atom_features(atom, explicit_H=False, use_chirality=True):
    """
    Adapted from TransformerCPI 2.0

    Encode an RDKit atom as a fixed-length feature vector:
    symbol (10) + degree (7) + formal charge & radical electrons (2) +
    hybridization (6) + aromaticity (1) = 26 base features, plus the
    total-H count (5) when `explicit_H` is False, plus chirality (3)
    when `use_chirality` is True (34 total by default).
    """
    symbol = ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I', 'other']  # 10-dim
    degree = [0, 1, 2, 3, 4, 5, 6]  # 7-dim
    hybridization_type = [Chem.rdchem.HybridizationType.SP,
                          Chem.rdchem.HybridizationType.SP2,
                          Chem.rdchem.HybridizationType.SP3,
                          Chem.rdchem.HybridizationType.SP3D,
                          Chem.rdchem.HybridizationType.SP3D2,
                          'other']  # 6-dim

    # 10+7+2+6+1=26
    results = one_of_k_encoding_unk(atom.GetSymbol(), symbol) + \
              one_of_k_encoding(atom.GetDegree(), degree) + \
              [atom.GetFormalCharge(), atom.GetNumRadicalElectrons()] + \
              one_of_k_encoding_unk(atom.GetHybridization(), hybridization_type) + [atom.GetIsAromatic()]

    # In case of explicit hydrogen(QM8, QM9), avoid calling `GetTotalNumHs`
    # 26+5=31
    if not explicit_H:
        results = results + one_of_k_encoding_unk(atom.GetTotalNumHs(),
                                                  [0, 1, 2, 3, 4])
    # 31+3=34
    if use_chirality:
        try:
            results = results + one_of_k_encoding_unk(
                atom.GetProp('_CIPCode'),
                ['R', 'S']) + [atom.HasProp('_ChiralityPossible')]
        except KeyError:
            # Bug fix: was a bare `except:` that would also swallow unrelated
            # errors; GetProp raises KeyError when no CIP code is assigned.
            results = results + [False, False] + [atom.HasProp('_ChiralityPossible')]

    return np.array(results)
48
+
49
+
50
def bond_features(bond):
    """Encode an RDKit bond as a 6-dim boolean vector:
    single/double/triple/aromatic bond type, conjugation, and ring membership.
    """
    bond_type = bond.GetBondType()
    flags = [
        bond_type == Chem.rdchem.BondType.SINGLE,
        bond_type == Chem.rdchem.BondType.DOUBLE,
        bond_type == Chem.rdchem.BondType.TRIPLE,
        bond_type == Chem.rdchem.BondType.AROMATIC,
        bond.GetIsConjugated(),
        bond.IsInRing(),
    ]
    return np.array(flags)
55
+
56
+
57
def smiles_to_graph_pyg(smiles):
    """
    Convert SMILES to graph with the default method defined by PyTorch Geometric
    """
    try:
        graph = from_smiles(smiles)
    except Exception as e:
        # Log and return None so a single bad SMILES does not abort the run.
        log.warning(f"Failed to featurize the following SMILES to graph: {smiles} due to {str(e)}")
        return None
    return graph
66
+
67
+
68
def smiles_to_graph(smiles, atom_features: callable = atom_features):
    """
    Convert SMILES to graph with custom atom_features

    Returns a torch_geometric ``Data`` with ``x`` the row-normalized atom
    feature matrix and ``edge_index`` of shape (2, num_directed_edges),
    or None if featurization fails.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)

        features = []
        for atom in mol.GetAtoms():
            feature = atom_features(atom)
            # Normalize each atom's feature vector so its entries sum to 1.
            features.append(feature / sum(feature))
        features = np.array(features)

        edges = []
        for bond in mol.GetBonds():
            edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
        # to_directed() makes every bond contribute both (i, j) and (j, i).
        g = nx.Graph(edges).to_directed()

        if len(edges) == 0:
            # Bond-less molecule (single atom): fall back to one self-loop
            # on atom 0 so edge_index is never empty.
            edge_index = [[0, 0]]
        else:
            edge_index = []
            for e1, e2 in g.edges:
                edge_index.append([e1, e2])

        return Data(x=torch.Tensor(features),
                    edge_index=torch.LongTensor(edge_index).transpose(0, 1))

    except Exception as e:
        # Invalid SMILES (MolFromSmiles -> None) ends up here via AttributeError.
        log.warning(f"Failed to convert SMILES ({smiles}) to graph due to {str(e)}")
        return None
99
+ # features = []
100
+ # for atom in mol.GetAtoms():
101
+ # feature = atom_features(atom)
102
+ # features.append(feature / sum(feature))
103
+ #
104
+ # edge_indices = []
105
+ # for bond in mol.GetBonds():
106
+ # i = bond.GetBeginAtomIdx()
107
+ # j = bond.GetEndAtomIdx()
108
+ # edge_indices += [[i, j], [j, i]]
109
+ #
110
+ # edge_index = torch.tensor(edge_indices)
111
+ # edge_index = edge_index.t().to(torch.long).view(2, -1)
112
+ #
113
+ # if edge_index.numel() > 0: # Sort indices.
114
+ # perm = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
115
+ # edge_index = edge_index[:, perm]
116
+ #
117
+
118
+
119
def smiles_to_mol_features(smiles, num_atom_feat: callable = None):
    """Featurize a SMILES string into (atom-feature matrix, adjacency matrix).

    Parameters
    ----------
    smiles : str
        Molecule SMILES.
    num_atom_feat : unused; kept (with a default added) for backward
        compatibility. NOTE(review): the original annotated this as a
        callable but immediately overwrote it with an int derived from
        `atom_features`; confirm no caller relies on passing it.

    Returns
    -------
    (np.ndarray, np.ndarray) : atom features of shape
        (num_atoms, feature_width) and the 0/1 adjacency matrix,
        or None when featurization fails.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        # Bug fix: derive the feature width via GetAtomWithIdx(0) instead of
        # indexing the atom sequence returned by GetAtoms().
        feat_width = len(atom_features(mol.GetAtomWithIdx(0)))
        atom_feat = np.zeros((mol.GetNumAtoms(), feat_width))
        for atom in mol.GetAtoms():
            atom_feat[atom.GetIdx(), :] = atom_features(atom)
        adj_mat = np.array(Chem.GetAdjacencyMatrix(mol))

        return atom_feat, adj_mat

    except Exception as e:
        log.warning(f"Failed to featurize the following SMILES to molecular features: {smiles} due to {str(e)}")
        return None
deepscreen/data/featurizers/monn.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from rdkit.Chem import MolFromSmiles
3
+
4
+ from deepscreen.data.featurizers.categorical import FASTA_VOCAB, fasta_to_label
5
+ from deepscreen.data.featurizers.graph import atom_features, bond_features
6
+
7
+
8
def get_mask(arr):
    """Return a (1, len(arr)) mask with ones over the first arr.shape[0] slots.

    Bug fixes: the original called `np.zeros(1, len(arr))`, passing the
    length as the dtype argument (TypeError), and wrote to row index 1,
    which is out of bounds for a single-row array.
    """
    mask = np.zeros((1, len(arr)))
    mask[0, :arr.shape[0]] = 1
    return mask
12
+
13
+
14
def add_index(input_array, ebd_size):
    """Offset each batch element's indices so they address a flattened
    (batch_size * ebd_size) embedding table.

    Bug fix: `range(...) * k` is invalid in Python 3 (in Python 2 `range`
    returned a list, which `*` could repeat); the range is now materialized
    with `list()` before repetition.

    Parameters
    ----------
    input_array : array of shape (batch_size, n_vertex, n_nbs)
        Per-batch local indices.
    ebd_size : int
        Size of one batch element's slice of the flattened table.

    Returns
    -------
    np.ndarray of shape (batch_size * n_vertex * n_nbs,)
    """
    batch_size, n_vertex, n_nbs = np.shape(input_array)
    # Offsets 0, ebd_size, 2*ebd_size, ... (one per batch element),
    # repeated for every (vertex, neighbor) slot.
    add_idx = np.array(list(range(0, ebd_size * batch_size, ebd_size)) * (n_nbs * n_vertex))
    add_idx = np.transpose(add_idx.reshape(-1, batch_size))
    add_idx = add_idx.reshape(-1)
    new_array = input_array.reshape(-1) + add_idx
    return new_array
21
+
22
+
23
# TODO fix padding and masking
def drug_featurizer(smiles, max_neighbors=6):
    """Convert a SMILES string into MONN-style GNN inputs.

    Returns (vertex_mask, feat_atoms, feat_bonds, atom_adj, bond_adj,
    neighbor_mask).

    NOTE(review): this function is known-broken (see the TODO above):
    - `feat_atoms[idx] = atom_features(atom)` assigns a whole feature
      vector into a scalar slot of a 1-D array — looks like feat_atoms/
      feat_bonds were meant to hold feature IDs, not vectors; confirm.
    - `n_neighbors` holds floats, so `atom_adj[a1, n_neighbors[a1]]` and
      the `neighbor_mask` slice use float indices, which NumPy rejects;
      the bare `except` then returns 5 empty lists while the success path
      returns 6 values — callers must cope with both arities.
    - `add_index` expects a 3-D array but atom_adj is 2-D, and
      `np.shape(feat_bonds)[1]` indexes a 1-D shape.
    """
    mol = MolFromSmiles(smiles)

    # convert molecule to GNN input
    n_atoms = mol.GetNumAtoms()
    assert mol.GetNumBonds() >= 0

    # At least one bond slot so zero-bond molecules still get an array.
    n_bonds = max(mol.GetNumBonds(), 1)
    feat_atoms = np.zeros((n_atoms,))  # atom feature ID
    feat_bonds = np.zeros((n_bonds,))  # bond feature ID
    atom_adj = np.zeros((n_atoms, max_neighbors))
    bond_adj = np.zeros((n_atoms, max_neighbors))
    n_neighbors = np.zeros((n_atoms,))
    neighbor_mask = np.zeros((n_atoms, max_neighbors))

    for atom in mol.GetAtoms():
        idx = atom.GetIdx()
        feat_atoms[idx] = atom_features(atom)

    for bond in mol.GetBonds():
        a1 = bond.GetBeginAtom().GetIdx()
        a2 = bond.GetEndAtom().GetIdx()
        idx = bond.GetIdx()
        feat_bonds[idx] = bond_features(bond)
        try:
            atom_adj[a1, n_neighbors[a1]] = a2
            atom_adj[a2, n_neighbors[a2]] = a1
        except:
            # Overflow past max_neighbors (or bad index) aborts featurization.
            return [], [], [], [], []
        bond_adj[a1, n_neighbors[a1]] = idx
        bond_adj[a2, n_neighbors[a2]] = idx
        n_neighbors[a1] += 1
        n_neighbors[a2] += 1

    for i in range(len(n_neighbors)):
        neighbor_mask[i, :n_neighbors[i]] = 1

    vertex_mask = get_mask(feat_atoms)
    # vertex = pack_1d(feat_atoms)
    # edge = pack_1d(feat_bonds)
    # atom_adj = pack_2d(atom_adj)
    # bond_adj = pack_2d(bond_adj)
    # nbs_mask = pack_2d(n_neighbors_mat)

    atom_adj = add_index(atom_adj, np.shape(atom_adj)[1])
    bond_adj = add_index(bond_adj, np.shape(feat_bonds)[1])

    return vertex_mask, feat_atoms, feat_bonds, atom_adj, bond_adj, neighbor_mask
72
+
73
+
74
# TODO WIP the pairwise_label matrix probably should be generated beforehand and stored as an extra label in the dataset
def get_pairwise_label(pdbid, interaction_dict, mol):
    """Build the atom-residue interaction (pairwise) label matrix for a complex.

    Parameters
    ----------
    pdbid : str
        Key into `interaction_dict`.
    interaction_dict : dict
        Per-complex interaction records ('atom_element', 'atom_name',
        'atom_interact', 'uniprot_seq', 'atom_bond_type',
        'residue_bond_type').
    mol : molecule whose atoms (in order) match the non-H recorded atoms.

    Returns
    -------
    (True, matrix) when the pdbid is known and at least one atom-residue
    pair is marked; otherwise (False, zeros((1, 1))).
    """
    if pdbid in interaction_dict:
        # Sanity check: the molecule's atoms must line up with the recorded
        # non-hydrogen atoms for this complex.
        sdf_element = np.array([atom.GetSymbol().upper() for atom in mol.GetAtoms()])
        atom_element = np.array(interaction_dict[pdbid]['atom_element'], dtype=str)
        atom_name_list = np.array(interaction_dict[pdbid]['atom_name'], dtype=str)
        atom_interact = np.array(interaction_dict[pdbid]['atom_interact'], dtype=int)
        nonH_position = np.where(atom_element != 'H')[0]
        assert sum(atom_element[nonH_position] != sdf_element) == 0

        atom_name_list = atom_name_list[nonH_position].tolist()
        pairwise_mat = np.zeros((len(nonH_position), len(interaction_dict[pdbid]['uniprot_seq'])), dtype=np.int32)
        for atom_name, bond_type in interaction_dict[pdbid]['atom_bond_type']:
            atom_idx = atom_name_list.index(str(atom_name))
            assert atom_idx < len(nonH_position)

            # Bug fix: the original collected matching residues into a list
            # but set `pairwise_mat[atom_idx, seq_idx] = 1` *after* the loop,
            # using only the leaked value of `seq_idx` from the final
            # iteration. Mark every residue sharing this bond type instead.
            for seq_idx, bond_type_seq in interaction_dict[pdbid]['residue_bond_type']:
                if bond_type == bond_type_seq:
                    pairwise_mat[atom_idx, seq_idx] = 1
        if len(np.where(pairwise_mat != 0)[0]) != 0:
            return True, pairwise_mat
    return False, np.zeros((1, 1))
99
+
100
+
101
def protein_featurizer(fasta):
    """Encode a FASTA protein sequence for MONN.

    Returns (seq_mask, sequence) where `sequence` is the label-encoded
    FASTA produced by `fasta_to_label` and `seq_mask` is a (1, len) ones
    mask from `get_mask`.

    NOTE(review): despite the comment below, no padding happens here —
    presumably padding is applied later in the collate step; confirm.
    """
    sequence = fasta_to_label(fasta)
    # pad proteins and make masks
    seq_mask = get_mask(sequence)

    return seq_mask, sequence
deepscreen/data/featurizers/token.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ from importlib import resources
3
+ import os
4
+ import re
5
+ from typing import Optional, List
6
+
7
+ import numpy as np
8
+ from transformers import BertTokenizer
9
+
10
+ SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
11
+ # \[[^\]]+\] # match anything inside square brackets
12
+ # |Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p # match elements
13
+ # |\(|\) # match parentheses
14
+ # |\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2} # match various symbols
15
+ # |[0-9] # match digits
16
+
17
+
18
def sequence_to_kmers(sequence, k=3):
    """Split a string into its overlapping k-mers.

    Parameters:
        sequence (string)
        k (int), default 3
    Returns:
        List of k-character substrings, in order; empty when
        len(sequence) < k.
    """
    n_kmers = len(sequence) - k + 1
    return [sequence[start:start + k] for start in range(n_kmers)]
28
+
29
+
30
def sequence_to_word_embedding(sequence, model):
    """Embed a protein sequence as a (num_kmers, 100) matrix of word vectors.

    Each 3-mer of `sequence` is looked up in the word2vec-style `model`;
    k-mers missing from the vocabulary are left as zero rows.
    """
    kmers = sequence_to_kmers(sequence)
    vec = np.zeros((len(kmers), 100))
    for row, kmer in enumerate(kmers):
        try:
            vec[row, :] = model.wv[kmer]
        except KeyError:
            # Out-of-vocabulary k-mer: keep the zero row.
            continue
    return vec
42
+
43
+
44
def sequence_to_token_ids(sequence, tokenizer):
    """Encode `sequence` with `tokenizer` and return the ids as a NumPy array."""
    return np.asarray(tokenizer.encode(sequence))
47
+
48
+
49
+ # def sequence_to_token_ids(sequence, tokenizer, max_length: int):
50
+ # token_ids = tokenizer.encode(sequence)
51
+ # length = min(max_length, len(token_ids))
52
+ #
53
+ # token_ids_padded = np.zeros(max_length, dtype='int')
54
+ # token_ids_padded[:length] = token_ids[:length]
55
+ #
56
+ # return token_ids_padded
57
+
58
+
59
class SmilesTokenizer(BertTokenizer):
    """
    Adapted from https://github.com/deepchem/deepchem/.

    Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
    implementation found in Huggingface's transformers library. It runs a WordPiece tokenization
    algorithm over SMILES strings using the tokenization SMILES regex developed by Schwaller et al.

    Please see https://github.com/huggingface/transformers
    and https://github.com/rxn4chemistry/rxnfp for more details.

    Examples
    --------
    >>> tokenizer = SmilesTokenizer(vocab_path, regex_pattern)
    >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
    [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]


    References
    ----------
    .. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
    Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
    Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Note
    ----
    This class requires huggingface's transformers and tokenizers libraries to be installed.
    """

    def __init__(
            self,
            vocab_file: str = 'resources/vocabs/smiles.txt',
            regex_pattern: str = SMI_REGEX_PATTERN,
            # unk_token="[UNK]",
            # sep_token="[SEP]",
            # pad_token="[PAD]",
            # cls_token="[CLS]",
            # mask_token="[MASK]",
            **kwargs):
        """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a SMILES character per line vocabulary file.
            Default vocab file is found in deepchem/feat/tests/data/vocab.txt
        """

        super().__init__(vocab_file, **kwargs)

        # NOTE(review): BertTokenizer's __init__ normally validates the vocab
        # path already, so this check may be redundant — confirm.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocab file at path '{}'.".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        # Position of the last "[unused…" placeholder token in file order
        # (0 when the vocab has none).
        unused_indexes = [i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")]
        self.highest_unused_index = 0 if len(unused_indexes) == 0 else max(unused_indexes)
        # Reverse mapping id -> token for decoding.
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.basic_tokenizer = BasicSmilesTokenizer(regex_pattern=regex_pattern)

    @property
    def vocab_size(self):
        # Number of tokens in the loaded vocabulary.
        return len(self.vocab)

    @property
    def vocab_list(self):
        # Tokens in vocabulary (insertion/file) order.
        return list(self.vocab.keys())

    def _tokenize(self, text: str, max_seq_length: int = 512, **kwargs):
        """Tokenize a string into a list of tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        """

        # Reserve two slots for the [CLS]/[SEP] tokens added later.
        max_len_single_sentence = max_seq_length - 2
        split_tokens = [
            token for token in self.basic_tokenizer.tokenize(text)
            [:max_len_single_sentence]
        ]
        return split_tokens

    def _convert_token_to_id(self, token: str):
        """Converts a token (str/unicode) in an id using the vocab.

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """

        # Unknown tokens map to the [UNK] id.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int):
        """Converts an index (integer) in a token (string/unicode) using the vocab.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part of a larger sequence.
        """

        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]):
        """Converts a sequence of tokens (string) in a single string.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """

        # Strip WordPiece continuation markers ("##") when joining.
        out_string: str = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def add_special_tokens_ids_single_sequence(self,
                                               token_ids: List[Optional[int]]):
        """Adds special tokens to a sequence for sequence classification tasks.

        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        token_ids: list[int]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        """

        return [self.cls_token_id] + token_ids + [self.sep_token_id]

    def add_special_tokens_single_sequence(self, tokens: List[str]):
        """Adds special tokens to the a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.
        """
        return [self.cls_token] + tokens + [self.sep_token]

    def add_special_tokens_ids_sequence_pair(
            self, token_ids_0: List[Optional[int]],
            token_ids_1: List[Optional[int]]) -> List[Optional[int]]:
        """Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

        Parameters
        ----------
        token_ids_0: List[int]
            List of ids for the first string sequence in the sequence pair (A).
        token_ids_1: List[int]
            List of tokens for the second string sequence in the sequence pair (B).
        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        return cls + token_ids_0 + sep + token_ids_1 + sep

    def add_padding_tokens(self,
                           token_ids: List[Optional[int]],
                           length: int,
                           right: bool = True) -> List[Optional[int]]:
        """Adds padding tokens to return a sequence of length max_length.
        By default padding tokens are added to the right of the sequence.

        Parameters
        ----------
        token_ids: list[optional[int]]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        length: int
        right: bool, default True

        Returns
        -------
        List[int]
        """
        padding = [self.pad_token_id] * (length - len(token_ids))

        if right:
            return token_ids + padding
        else:
            return padding + token_ids
251
+
252
+
253
class BasicSmilesTokenizer(object):
    """
    Adapted from https://github.com/deepchem/deepchem/.
    Run basic SMILES tokenization using a regex pattern developed by Schwaller et. al.
    This tokenizer is to be used when a tokenizer that does not require the transformers library by HuggingFace is required.

    Examples
    --------
    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
    ['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']


    References
    ----------
    .. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
    ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
    1572-1583 DOI: 10.1021/acscentsci.9b00576
    """

    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
        """Construct a BasicSmilesTokenizer around a SMILES token regex.

        Parameters
        ----------
        regex_pattern: string
            SMILES token regex; compiled once here and reused per call.
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """Return the list of SMILES tokens matched in `text`, in order."""
        return list(self.regex.findall(text))
+ return tokens
289
+
290
+
291
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered
    token -> index mapping (index = zero-based line number)."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab
deepscreen/data/single_entity.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from itertools import product
2
+ from numbers import Number
3
+ from pathlib import Path
4
+ from typing import Any, Dict, Optional, Sequence, Union, Literal
5
+
6
+ # import numpy as np
7
+ import pandas as pd
8
+ from lightning import LightningDataModule
9
+ from sklearn.base import TransformerMixin
10
+ from torch.utils.data import Dataset, DataLoader, random_split
11
+
12
+ from deepscreen.data.utils.dataset import SingleEntitySingleTargetDataset, BaseEntityDataset
13
+ from deepscreen.data.utils.label import label_transform
14
+ from deepscreen.data.utils.collator import collate_fn
15
+ from deepscreen.data.utils.sampler import SafeBatchSampler
16
+
17
+
18
class EntityDataModule(LightningDataModule):
    """
    Single-entity DataModule.

    A DataModule implements the standard Lightning hooks (prepare_data,
    setup, {train,val,test,predict}_dataloader, teardown) so a full
    dataset can be shared without explaining how to download, split,
    transform and process the data.

    Read the docs:
    https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
            self,
            dataset: type[BaseEntityDataset],
            task: Literal['regression', 'binary', 'multiclass'],
            n_classes: Optional[int],
            train: bool,
            batch_size: int,
            num_workers: int = 0,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            pin_memory: bool = False,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            # Bug fix: `Optional[...]` takes exactly one type argument; the
            # original `Optional[Sequence[Number], Sequence[str]]` raised a
            # TypeError when the class was imported.
            train_val_test_split: Optional[Union[Sequence[Number], Sequence[str]]] = None,
            split: Optional[callable] = random_split,
    ):
        super().__init__()
        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

        # data processing
        self.split = split

        # Bug fix: these attributes were never initialized, so `setup()`
        # crashed with AttributeError before any data had been assigned.
        # (Bug fix: the original also computed `Path(data_dir) / data_file`
        # unconditionally, which raised TypeError when data_file was None.)
        self.data_train = self.data_val = self.data_test = self.data_predict = None

        if train:
            if all([data_file, split]):
                if all(isinstance(part, Number) for part in train_val_test_split):
                    # One data file, numeric split fractions/counts: the
                    # dataset is loaded and split lazily in `setup()`.
                    pass
                else:
                    raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
                                     '(float for percentages and int for sample numbers) if '
                                     '`data_file` and `split` have been specified.')
            elif all(isinstance(part, str) for part in train_val_test_split) and not any([data_file, split]):
                # Pre-split data files: load every split eagerly.
                # Bug fix: the original stored these in `self.train_data`
                # etc., which `setup()` never reads (it checks
                # `self.data_train`/`self.data_val`/`self.data_test`).
                self.data_train = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[0]))
                self.data_val = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[1]))
                self.data_test = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[2]))
            else:
                raise ValueError('For training (train=True), you must specify either '
                                 '`dataset_name` and `split` with `train_val_test_split` of 3 numbers or '
                                 'solely `train_val_test_split` of 3 data file names.')
        else:
            if data_file and not any([split, train_val_test_split]):
                self.data_test = self.data_predict = dataset(dataset_path=str(Path(data_dir) / data_file))
            else:
                raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
                                 "`train_val_test_split` or `split`")

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.
        """
        # load and split datasets only if not loaded in initialization
        if not any([self.data_train, self.data_val, self.data_test, self.data_predict]):
            dataset = SingleEntitySingleTargetDataset(
                task=self.hparams.task,
                n_classes=self.hparams.n_classes,
                # Bug fix: the original referenced `self.hparams.dataset_name`,
                # `.transformer` and `.featurizer`, none of which are saved
                # hyperparameters of this module — every call raised
                # AttributeError. Only init params are available here.
                dataset_path=Path(self.hparams.data_dir) / self.hparams.data_file,
                thresholds=self.hparams.thresholds,
            )

            if self.hparams.train:
                self.data_train, self.data_val, self.data_test = self.split(
                    dataset=dataset,
                    lengths=self.hparams.train_val_test_split
                )
            else:
                self.data_test = self.data_predict = dataset

    def _make_dataloader(self, data, shuffle: bool):
        """Shared DataLoader construction for every stage; only the dataset
        and shuffling differ between train/val/test/predict."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                shuffle=shuffle),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        return self._make_dataloader(self.data_train, shuffle=True)

    def val_dataloader(self):
        return self._make_dataloader(self.data_val, shuffle=False)

    def test_dataloader(self):
        return self._make_dataloader(self.data_test, shuffle=False)

    def predict_dataloader(self):
        return self._make_dataloader(self.data_predict, shuffle=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
deepscreen/data/utils/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Sequence, TypeVar, Union
2
+
3
+ from deepscreen.data.utils.collator import collate_fn
4
+ from deepscreen.data.utils.label import label_transform
5
+ from deepscreen.data.utils.sampler import SafeBatchSampler
6
+
7
# Generic type variable used to parameterize FlexibleIterable.
T = TypeVar('T')
# A bare value of type T, a sequence of T, or a string-keyed mapping of T.
FlexibleIterable = Union[T, Sequence[T], Dict[str, T]]
deepscreen/data/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (701 Bytes). View file
 
deepscreen/data/utils/__pycache__/collator.cpython-311.pyc ADDED
Binary file (4.97 kB). View file
 
deepscreen/data/utils/__pycache__/label.cpython-311.pyc ADDED
Binary file (4.88 kB). View file
 
deepscreen/data/utils/__pycache__/sampler.cpython-311.pyc ADDED
Binary file (3.56 kB). View file
 
deepscreen/data/utils/__pycache__/split.cpython-311.pyc ADDED
Binary file (5.68 kB). View file
 
deepscreen/data/utils/collator.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Define collate functions for new data types here
3
+ """
4
+ from functools import partial
5
+ from itertools import chain
6
+
7
+ import dgl
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+ from torch.utils.data._utils.collate import default_collate_fn_map, collate_tensor_fn, collate
11
+ import torch_geometric
12
+
13
+
14
def collate_pyg_fn(batch, collate_fn_map=None):
    """Collate a list of PyTorch Geometric graphs into one batched graph.
    `collate_fn_map` is unused; it is part of the collate-dispatch signature."""
    merged = torch_geometric.data.Batch.from_data_list(batch)
    return merged
19
+
20
+
21
def collate_dgl_fn(batch, collate_fn_map=None):
    """Collate a list of DGL graphs into one batched DGLGraph.
    `collate_fn_map` is unused; it is part of the collate-dispatch signature."""
    batched_graph = dgl.batch(batch)
    return batched_graph
26
+
27
+
28
def pad_collate_tensor_fn(batch, padding_value=0.0, collate_fn_map=None):
    """
    Collate variable-length tensors into one stacked tensor plus lengths.

    Similar to pad_packed_sequence(pack_sequence(batch, enforce_sorted=False),
    batch_first=True), but additionally supports padding a list of square
    Tensors of size ``(L x L x ...)``.
    :param batch: list of tensors; tensor.size(0) is taken as its length
    :param padding_value: fill value for padded positions
    :param collate_fn_map: unused; part of the collate-dispatch signature
    :return: padded_batch, lengths
    """
    lengths = [t.size(0) for t in batch]
    uniform = all(length == lengths[0] for length in lengths[1:])
    if uniform:
        # All first dimensions agree: plain stacking suffices.
        padded = collate_tensor_fn(batch)
    else:
        try:
            # First dims differ but trailing dims agree: pad_sequence handles it.
            padded = pad_sequence(batch, batch_first=True, padding_value=padding_value)
        except RuntimeError:
            # No shared trailing shape: pad every dimension of every tensor
            # up to the batch-wide maximum for that dimension.
            ndim = batch[0].dim()
            max_sizes = [max(t.size(dim) for t in batch) for dim in range(ndim)]
            padded_items = []
            for t in batch:
                # torch.nn.functional.pad expects (last-dim pads first).
                pad_spec = []
                for dim in reversed(range(t.dim())):
                    pad_spec.extend((0, max_sizes[dim] - t.size(dim)))
                padded_items.append(
                    torch.nn.functional.pad(t, tuple(pad_spec), mode='constant', value=padding_value))
            padded = collate_tensor_fn(padded_items)

    # Return the padded batch tensor and the original lengths.
    return padded, torch.as_tensor(lengths)
58
+
59
+
60
# Join custom collate functions with the default collation map of PyTorch
# so `collate` can dispatch on PyG (BaseData) and DGL graph batches too.
COLLATE_FN_MAP = default_collate_fn_map | {
    torch_geometric.data.data.BaseData: collate_pyg_fn,
    dgl.DGLGraph: collate_dgl_fn,
}
65
+
66
+
67
def collate_fn(batch, automatic_padding=False, padding_value=0):
    """Collate a batch using the extended dispatch map (tensors, PyG, DGL).

    When `automatic_padding` is True, variable-length tensors are padded
    with `padding_value` and returned together with their lengths.
    """
    if automatic_padding:
        # Bug fix: the original mutated the module-level COLLATE_FN_MAP in
        # place, so a single call with automatic_padding=True permanently
        # switched tensor collation to padding for every later call (even
        # ones passing automatic_padding=False). Build a per-call map instead.
        fn_map = COLLATE_FN_MAP | {
            torch.Tensor: partial(pad_collate_tensor_fn, padding_value=padding_value),
        }
    else:
        fn_map = COLLATE_FN_MAP
    return collate(batch, collate_fn_map=fn_map)
73
+
74
+
75
+ # class VariableLengthSequence(torch.Tensor):
76
+ # """
77
+ # A custom PyTorch Tensor class that is similar to PackedSequence, except it can be directly used as a batch tensor,
78
+ # and it has an attribute called lengths, which signifies the length of each original sequence in the batch.
79
+ # """
80
+ #
81
+ # def __new__(cls, data, lengths):
82
+ # """
83
+ # Creates a new VariableLengthSequence object from the given data and lengths.
84
+ # Args:
85
+ # data (torch.Tensor): The batch collated tensor of shape (batch_size, max_length, *).
86
+ # lengths (torch.Tensor): The lengths of each original sequence in the batch of shape (batch_size,).
87
+ # Returns:
88
+ # VariableLengthSequence: A new VariableLengthSequence object.
89
+ # """
90
+ # # Check the validity of the inputs
91
+ # assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
92
+ # assert isinstance(lengths, torch.Tensor), "lengths must be a torch.Tensor"
93
+ # assert data.dim() >= 2, "data must have at least two dimensions"
94
+ # assert lengths.dim() == 1, "lengths must have one dimension"
95
+ # assert data.size(0) == lengths.size(0), "data and lengths must have the same batch size"
96
+ # assert lengths.min() > 0, "lengths must be positive"
97
+ # assert lengths.max() <= data.size(1), "lengths must not exceed the max length of data"
98
+ #
99
+ # # Create a new tensor object from data
100
+ # obj = super().__new__(cls, data)
101
+ #
102
+ # # Set the lengths attribute
103
+ # obj.lengths = lengths
104
+ #
105
+ # return obj
106
+
107
+
108
+ # class VariableLengthSequence(torch.Tensor):
109
+ # _lengths = torch.Tensor()
110
+ #
111
+ # def __new__(cls, data, lengths, *args, **kwargs):
112
+ # self = super().__new__(cls, data, *args, **kwargs)
113
+ # self.lengths = lengths
114
+ # return self
115
+ #
116
+ # def clone(self, *args, **kwargs):
117
+ # return VariableLengthSequence(super().clone(*args, **kwargs), self.lengths.clone())
118
+ #
119
+ # def new_empty(self, *size):
120
+ # return VariableLengthSequence(super().new_empty(*size), self.lengths)
121
+ #
122
+ # def to(self, *args, **kwargs):
123
+ # return VariableLengthSequence(super().to(*args, **kwargs), self.lengths.to(*args, **kwargs))
124
+ #
125
+ # def __format__(self, format_spec):
126
+ # # Convert self to a string or a number here, depending on what you need
127
+ # return self.item().__format__(format_spec)
128
+ #
129
+ # @property
130
+ # def lengths(self):
131
+ # return self._lengths
132
+ #
133
+ # @lengths.setter
134
+ # def lengths(self, lengths):
135
+ # self._lengths = lengths
136
+ #
137
+ # def cpu(self, *args, **kwargs):
138
+ # return VariableLengthSequence(super().cpu(*args, **kwargs), self.lengths.cpu(*args, **kwargs))
139
+ #
140
+ # def cuda(self, *args, **kwargs):
141
+ # return VariableLengthSequence(super().cuda(*args, **kwargs), self.lengths.cuda(*args, **kwargs))
142
+ #
143
+ # def pin_memory(self):
144
+ # return VariableLengthSequence(super().pin_memory(), self.lengths.pin_memory())
145
+ #
146
+ # def share_memory_(self):
147
+ # super().share_memory_()
148
+ # self.lengths.share_memory_()
149
+ # return self
150
+ #
151
+ # def detach_(self, *args, **kwargs):
152
+ # super().detach_(*args, **kwargs)
153
+ # self.lengths.detach_(*args, **kwargs)
154
+ # return self
155
+ #
156
+ # def detach(self, *args, **kwargs):
157
+ # return VariableLengthSequence(super().detach(*args, **kwargs), self.lengths.detach(*args, **kwargs))
158
+ #
159
+ # def record_stream(self, *args, **kwargs):
160
+ # super().record_stream(*args, **kwargs)
161
+ # self.lengths.record_stream(*args, **kwargs)
162
+ # return self
163
+
164
+
165
+ # @classmethod
166
+ # def __torch_function__(cls, func, types, args=(), kwargs=None):
167
+ # return super().__torch_function__(func, types, args, kwargs) \
168
+ # if cls.lengths is not None else torch.Tensor.__torch_function__(func, types, args, kwargs)
deepscreen/data/utils/dataset.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from numbers import Number
2
+ from typing import Literal, Union, Sequence
3
+
4
+ import pandas as pd
5
+ from sklearn.base import TransformerMixin
6
+ from sklearn.exceptions import NotFittedError
7
+ from sklearn.utils.validation import check_is_fitted
8
+ from torch.utils.data import Dataset
9
+
10
+ from deepscreen.data.utils import label_transform, FlexibleIterable
11
+
12
+
13
class BaseEntityDataset(Dataset):
    """Base dataset over a CSV data table whose columns are selected by name prefix.

    Only columns whose names start with one of ``use_col_prefixes`` are loaded:
    'X' (entity representations), 'Y' (labels), 'ID' (entity identifiers) and
    'U' (label units). Label ('Y*') columns are parsed as float32, all other
    kept columns as pandas strings.
    """

    def __init__(
            self,
            dataset_path: str,
            use_col_prefixes=('X', 'Y', 'ID', 'U')
    ):
        # First pass: header row only, to decide which columns to keep.
        header = pd.read_csv(
            dataset_path,
            header=0, nrows=0,
            usecols=lambda col: col.startswith(use_col_prefixes)
        )
        # Second pass: load the full table with per-column dtypes
        # ('Y*' -> float32, everything else -> string).
        dtypes = {col: 'float32' if col.startswith('Y') else 'string' for col in header.columns}
        self.df = pd.read_csv(
            dataset_path,
            header=0,
            usecols=header.columns,
            dtype=dtypes
        )

        # Cache the column names grouped by prefix for convenient access.
        self.label_cols = [col for col in self.df.columns if col.startswith('Y')]
        self.label_unit_cols = [col for col in self.df.columns if col.startswith('U')]
        self.entity_id_cols = [col for col in self.df.columns if col.startswith('ID')]
        self.entity_cols = [col for col in self.df.columns if col.startswith('X')]

    def __len__(self):
        """Number of rows in the data table."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Sample retrieval is left to subclasses."""
        raise NotImplementedError
45
+
46
+
47
+ # TODO test transform
48
+ class SingleEntitySingleTargetDataset(BaseEntityDataset):
49
+ def __init__(
50
+ self,
51
+ dataset_path: str,
52
+ task: Literal['regression', 'binary', 'multiclass'],
53
+ n_classes: int,
54
+ featurizer: callable,
55
+ transformer: TransformerMixin = None,
56
+ thresholds: Union[Number, Sequence[Number]] = None,
57
+ discard_intermediate: bool = None,
58
+ forward_fill: bool = True
59
+ ):
60
+ super().__init__(dataset_path)
61
+
62
+ assert len(self.entity_cols) == 1, 'The dataset contains more than 1 entity column (starting with `X`).'
63
+ if len(self.label_cols) >= 0:
64
+ assert len(self.label_cols) == 1, 'The dataset contains more than 1 label column (starting with `Y`).'
65
+ # Remove trailing `1`s in column names for flexibility
66
+ self.df.columns = self.df.columns.str.rstrip('1')
67
+
68
+ # Forward-fill non-label columns
69
+ nonlabel_cols = self.label_unit_cols + self.entity_id_cols + self.entity_cols
70
+ if forward_fill:
71
+ self.df[nonlabel_cols] = self.df[nonlabel_cols].ffill(axis=0)
72
+
73
+ # Process target labels for training/testing if exist
74
+ if self.label_cols:
75
+ # Transform target labels
76
+ self.df[self.label_cols] = self.df[self.label_cols].apply(
77
+ label_transform,
78
+ units=self.df.get('U', None),
79
+ thresholds=thresholds,
80
+ discard_intermediate=discard_intermediate).astype('float32')
81
+
82
+ # Filter out rows with a NaN in Y (missing values); use inplace to save memory
83
+ self.df.dropna(subset=self.label_cols, inplace=True)
84
+
85
+ # Validate target labels
86
+ # TODO: check sklearn.utils.multiclass.check_classification_targets
87
+ match task:
88
+ case 'regression':
89
+ assert all(self.df['Y'].apply(lambda x: isinstance(x, Number))), \
90
+ f"Y for task `regression` must be numeric; got {set(self.df['Y'].apply(type))}."
91
+ case 'binary':
92
+ assert all(self.df['Y'].isin([0, 1])), \
93
+ f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(self.df['Y'])}." \
94
+ "\nYou may set `thresholds` to discretize continuous labels."
95
+ case 'multiclass':
96
+ assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
97
+ assert all(self.df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
98
+ f"``Y` for task `multiclass` (classification) must be non-negative integers, " \
99
+ f"but `Y` got {pd.unique(self.df['Y'])}." \
100
+ "\nYou may set `thresholds` to discretize continuous labels."
101
+ target_n_unique = self.df['Y'].nunique()
102
+ assert target_n_unique == n_classes, \
103
+ f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
104
+ f"but `Y` has {target_n_unique} unique labels."
105
+
106
+ if transformer:
107
+ self.df['X'] = self.df['X'].apply(featurizer)
108
+ try:
109
+ check_is_fitted(transformer)
110
+ self.df['X'] = list(transformer.transform(self.df['X']))
111
+ except NotFittedError:
112
+ self.df['X'] = list(transformer.fit_transform(self.df['X']))
113
+
114
+ # Skip sample-wise feature extraction because it has already been done dataset-wise
115
+ self.featurizer = lambda x: x
116
+
117
+ self.featurizer = featurizer
118
+ self.n_classes = n_classes
119
+ self.df['ID'] = self.df.get('ID', self.df['X'])
120
+
121
+ def __getitem__(self, idx):
122
+ sample = self.df.loc[idx]
123
+ return {
124
+ 'X': self.featurizer(sample['X']),
125
+ 'ID': sample['ID'],
126
+ 'Y': sample.get('Y')
127
+ }
128
+
129
+
130
# TODO WIP
class MultiEntityMultiTargetDataset(BaseEntityDataset):
    """Work-in-progress dataset supporting multiple entity columns (`X*`) and
    multiple label columns (`Y*`), with per-column featurizers.

    NOTE(review): this class re-reads the data table itself even though
    `BaseEntityDataset.__init__` (called via `super()`) has already loaded it,
    so the file is parsed twice and `self.df` is overwritten at the end.
    """

    def __init__(
            self,
            dataset_path: str,
            task: FlexibleIterable[Literal['regression', 'binary', 'multiclass']],
            n_class: FlexibleIterable[int],
            featurizers: FlexibleIterable[callable],
            thresholds: FlexibleIterable[Union[Number, Sequence[Number]]] = None,
            discard_intermediate: FlexibleIterable[bool] = None,
    ):
        super().__init__(dataset_path)
        # Column-name prefixes: labels (`Y*`) vs everything else (`X*`, `ID*`, `U*`).
        label_col_prefix = tuple('Y')
        nonlabel_col_prefixes = tuple(('X', 'ID', 'U'))
        allowed_col_prefixes = label_col_prefix + nonlabel_col_prefixes

        # Read the headers first to filter columns and create column dtype dict
        df = pd.read_csv(
            dataset_path,
            header=0, nrows=0,
            usecols=lambda col: col.startswith(allowed_col_prefixes)
        )

        # Read the whole table; labels as float32, all other kept columns as strings
        df = pd.read_csv(
            dataset_path,
            header=0,
            usecols=df.columns,
            dtype={col: 'float32' if col.startswith('Y') else 'string' for col in df.columns}
        )
        label_cols = [col for col in df.columns if col.startswith(label_col_prefix)]
        nonlabel_cols = [col for col in df.columns if col.startswith(nonlabel_col_prefixes)]
        self.entity_cols = [col for col in nonlabel_cols if col.startswith('X')]

        # Forward-fill all non-label columns
        df[nonlabel_cols] = df[nonlabel_cols].ffill(axis=0)

        # Process target labels for training/testing
        if label_cols:
            # Transform target labels
            # NOTE(review): the same `thresholds`/`discard_intermediate` values are passed
            # for every label column even though they are annotated as per-column
            # iterables — per-column dispatch looks unfinished.
            df[label_cols] = df[label_cols].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
                                                  discard_intermediate=discard_intermediate).astype('float32')

            # Filter out rows with a NaN in Y (missing values)
            df.dropna(subset=label_cols, inplace=True)

            # Validate target labels
            # TODO: check sklearn.utils.multiclass.check_classification_targets
            # WIP
            # NOTE(review): `task` is typed as an iterable of tasks but is matched against
            # single string literals, and the branches index a single `Y` column — the
            # multi-target validation below is incomplete.
            match task:
                case 'regression':
                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
                        f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
                case 'binary':
                    assert all(df['Y'].isin([0, 1])), \
                        f"Y for task `binary` must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
                        "\nYou may set `thresholds` to discretize continuous labels."
                case 'multiclass':
                    assert len(label_cols) == len(n_class), \
                        (f'Data table has {len(label_cols)} label columns (`Y*`) but you have specified '
                         f'n_class of length {len(n_class)} for task `multiclass`.')
                    # NOTE(review): iterating `df[label_cols]` yields column NAMES (strings),
                    # not Series — `label.apply(...)` below would fail; `df[label_cols].items()`
                    # is likely intended. TODO confirm before enabling this path.
                    for label, n in zip(df[label_cols], n_class):
                        assert n >= 3, f'n_class for task `multiclass` must be at least 3.'
                        assert all(label.apply(lambda x: x.is_integer() and x >= 0)), \
                            f"Y for task `multiclass` must be non-negative integers, " \
                            f"but Y got {pd.unique(label)}." \
                            "\nYou may set `thresholds` to discretize continuous labels."
                        target_n_unique = label.nunique()
                        assert target_n_unique == n, \
                            f"You have set n_classes for task `multiclass` task to {n}, " \
                            f"but Y has {target_n_unique} unique labels."

        self.df = df
        self.featurizers = featurizers
        self.n_class = n_class

    def __len__(self):
        # Number of rows in the data table.
        return len(self.df.index)

    # WIP
    def __getitem__(self, idx):
        # One featurizer per entity column, applied positionally.
        sample = self.df.loc[idx]
        return {
            'X': [featurizer(x) for featurizer, x in zip(self.featurizers, sample[self.entity_cols])],
            'ID': sample.get('ID', sample['X']),
            'Y': sample.get('Y')
        }
deepscreen/data/utils/label.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from numbers import Number
2
+ from typing import Optional, Union
3
+
4
+ import numpy as np
5
+
6
+ from deepscreen.utils import get_logger
7
+
8
+ log = get_logger(__name__)
9
+
10
# Converters from a molar concentration in the given unit to the p scale (-log10[M]);
# 'p' means the value is already on the p scale and is passed through unchanged.
MOLARITY_TO_POTENCY = {
    'p': lambda x: x,
    'M': lambda x: -np.log10(x),
    'mM': lambda x: -np.log10(x) + 3,
    'μM': lambda x: -np.log10(x) + 6,
    'uM': lambda x: -np.log10(x) + 6,  # in case someone doesn't know how to type micromolar lol
    'nM': lambda x: -np.log10(x) + 9,
    'pM': lambda x: -np.log10(x) + 12,
    'fM': lambda x: -np.log10(x) + 15,
}


# TODO rewrite for swifter.apply
def molar_to_p(labels, units):
    """Convert affinity labels in molar units to the p scale (-log10[M]).

    :param labels: sequence of numeric labels, each expressed in its corresponding unit.
    :param units: sequence of unit strings (same length as labels); every entry must be
        a key of ``MOLARITY_TO_POTENCY``.
    :return: numpy array of labels on the p scale.
    """
    # BUGFIX: the original asserted `units in MOLARITY_TO_POTENCY` (membership of the
    # whole sequence, raising TypeError for unhashable inputs) and iterated
    # `for label, unit in (labels, units)`, which unpacks the 2-tuple itself instead of
    # pairing each label with its unit.
    invalid_units = set(units) - set(MOLARITY_TO_POTENCY)
    assert not invalid_units, f"Allowed units: {', '.join(MOLARITY_TO_POTENCY)}."

    return np.array([MOLARITY_TO_POTENCY[unit](label) for label, unit in zip(labels, units)])
32
+
33
+
34
def label_discretize(labels, thresholds):
    """Discretize continuous labels against one or more thresholds.

    A single threshold maps values below it to 1 and values at or above it to 0.
    A sequence of thresholds is applied in descending order, so smaller values
    receive higher discrete levels.
    """
    if isinstance(thresholds, Number):
        # np.digitize against one ascending bin yields 0 below / 1 at-or-above the
        # cutoff; flipping with `1 -` gives 1 below the cutoff and 0 otherwise.
        return 1 - np.digitize(labels, [thresholds])
    # Descending bins: the smaller the label, the higher its assigned level.
    descending_thresholds = np.sort(thresholds)[::-1]
    return np.digitize(labels, descending_thresholds)
45
+
46
+
47
def label_transform(
        labels,
        units: Optional[list[str]],
        thresholds: Optional[Union[float, list[Number]]],
        discard_intermediate: Optional[bool]
):
    """Convert labels of all units to the p scale (-log10[M]) and discretize them if specified.

    :param labels: a sequence of labels, continuous or binary values
    :type labels: array_like
    :param units: a sequence of label units; each must be a key of MOLARITY_TO_POTENCY
        (p, M, mM, μM, uM, nM, pM, fM); pass None when labels are already on the p scale
    :type units: array_like, optional
    :param thresholds: discretization threshold(s) for affinity labels, in p scale (-log10[M]).
        A single number maps affinities below it to 1 and otherwise to 0.
        A sequence of two or more thresholds maps affinities to multiple discrete levels
        descendingly, assigning values below the lowest threshold to the highest level
        (e.g. 2) and values above the greatest threshold to 0
    :type thresholds: list, float, optional
    :param discard_intermediate: whether to discard the intermediate (indeterminate) level
        if provided an odd number of thresholds (>=3)
    :type discard_intermediate: bool
    :return: a numpy array of affinity labels in p scale (-log10[M]) or discrete labels
    """
    # NOTE: the original used an f-string here, which is not recognized as a docstring
    # and was re-evaluated (then discarded) on every call.
    if units is not None:
        # BUGFIX: `if units:` raised ValueError for pandas Series inputs
        # (ambiguous truth value), which is exactly what the datasets pass in.
        labels = molar_to_p(labels, units)

    if thresholds is not None:
        # BUGFIX: `if thresholds:` silently skipped discretization for a threshold of 0.
        labels = label_discretize(labels, thresholds)
        if discard_intermediate:
            assert not isinstance(thresholds, Number) and len(thresholds) % 2 == 1 and len(thresholds) >= 3, \
                "Must give an odd number of (at least 3) thresholds to discard the intermediate level."
            intermediate_level = len(thresholds) // 2
            # BUGFIX: np.digitize returns an integer array, which cannot hold NaN;
            # cast to float before masking.
            labels = np.asarray(labels, dtype=float)
            # Make the intermediate-level labels NaN (which will be filtered out later)
            labels[labels == intermediate_level] = np.nan
            # Reduce all levels above the intermediate level by 1
            labels[labels > intermediate_level] -= 1

    return labels
93
+
deepscreen/data/utils/sampler.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Mapping, Iterable
2
+
3
+ from torch.utils.data import BatchSampler, RandomSampler, SequentialSampler
4
+
5
+
6
class SafeBatchSampler(BatchSampler):
    """
    A safe `batch_sampler` that skips samples with `None` values, supports shuffling, and keeps a fixed batch size.

    Args:
        data_source (Dataset): The dataset to sample from.
        batch_size (int): The size of each batch.
        drop_last (bool): Whether to drop the last batch if its size is smaller than `batch_size`.
        shuffle (bool): Whether to shuffle the data before sampling.
        sampler (Sampler, optional): A pre-built index sampler; when given it takes
            precedence over `shuffle`. Defaults to `None`.

    Example:
        >>> dataloader = DataLoader(dataset, batch_sampler=SafeBatchSampler(dataset, batch_size, drop_last, shuffle))
    """

    def __init__(self, data_source, batch_size: int, drop_last: bool, shuffle: bool, sampler=None):
        if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
                batch_size <= 0:
            raise ValueError(f"batch_size should be a positive integer value, but got batch_size={batch_size}")
        if not isinstance(drop_last, bool):
            raise ValueError(f"drop_last should be a boolean value, but got drop_last={drop_last}")
        # BUGFIX: test `sampler is None` instead of truthiness so that a provided but
        # currently-empty sampler (falsy via __len__) is not silently replaced.
        if sampler is None:
            if shuffle:
                sampler = RandomSampler(data_source)  # type: ignore[arg-type]
            else:
                sampler = SequentialSampler(data_source)  # type: ignore[arg-type]

        super().__init__(sampler, batch_size, drop_last)
        # Kept so __iter__ can inspect each sample for None values.
        self.data_source = data_source

    def __iter__(self):
        """Yield batches of indices whose samples contain no `None` values."""
        batch = [0] * self.batch_size
        idx_in_batch = 0
        for idx in self.sampler:
            sample = self.data_source[idx]
            # Normalize the sample into a flat collection of values for the None check:
            # mappings contribute their values, other non-string iterables are checked
            # element-wise, and scalars (including strings) are wrapped in a list.
            if isinstance(sample, (Iterable, Mapping)) and not isinstance(sample, str):
                if isinstance(sample, Mapping):
                    sample = sample.values()
            else:
                sample = [sample]

            if all(v is not None for v in sample):
                batch[idx_in_batch] = idx
                idx_in_batch += 1
                if idx_in_batch == self.batch_size:
                    yield batch
                    idx_in_batch = 0
                    batch = [0] * self.batch_size

        if idx_in_batch > 0 and not self.drop_last:
            yield batch[:idx_in_batch]
        # BUGFIX: removed the dead `if not any(batch): return` tail — `any(batch)` tests
        # whether all buffered indices equal 0 (not emptiness), and the commented-out
        # `raise StopIteration` would be illegal inside a generator since PEP 479.