jannisborn committed
Commit 14da265
1 Parent(s): 78e0383
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Regression Transformer
+ title: Molecular properties
  emoji: 💡
  colorFrom: green
  colorTo: blue
app.py CHANGED
@@ -2,168 +2,98 @@ import logging
  import pathlib

  import gradio as gr
  import pandas as pd
- from gt4sd.algorithms.conditional_generation.regression_transformer import (
-     RegressionTransformer,
- )
- from gt4sd.algorithms.registry import ApplicationsRegistry
- from utils import (
-     draw_grid_generate,
-     draw_grid_predict,
-     get_application,
-     get_inference_dict,
-     get_rt_name,
- )

  logger = logging.getLogger(__name__)
  logger.addHandler(logging.NullHandler())


- def regression_transformer(
-     algorithm: str,
-     task: str,
-     target: str,
-     number_of_samples: int,
-     search: str,
-     temperature: float,
-     tolerance: int,
-     wrapper: bool,
-     fraction_to_mask: float,
-     property_goal: str,
-     tokens_to_mask: str,
-     substructures_to_mask: str,
-     substructures_to_keep: str,
- ):
-
-     if task == "Predict" and wrapper:
-         logger.warning(
-             f"For prediction, no sampling_wrapper will be used, ignoring: fraction_to_mask: {fraction_to_mask}, "
-             f"tokens_to_mask: {tokens_to_mask}, substructures_to_mask={substructures_to_mask}, "
-             f"substructures_to_keep: {substructures_to_keep}."
-         )
-         sampling_wrapper = {}
-     elif not wrapper:
-         sampling_wrapper = {}
-     else:
-         substructures_to_mask = (
-             []
-             if substructures_to_mask == ""
-             else substructures_to_mask.replace(" ", "").split(",")
-         )
-         substructures_to_keep = (
-             []
-             if substructures_to_keep == ""
-             else substructures_to_keep.replace(" ", "").split(",")
-         )
-         tokens_to_mask = [] if tokens_to_mask == "" else tokens_to_mask.split(",")
-
-         property_goals = {}
-         if property_goal == "":
-             raise ValueError(
-                 "For conditional generation you have to specify `property_goal`."
-             )
-         for line in property_goal.split(","):
-             property_goals[line.split(":")[0].strip()] = float(line.split(":")[1])
-
-         sampling_wrapper = {
-             "substructures_to_keep": substructures_to_keep,
-             "substructures_to_mask": substructures_to_mask,
-             "text_filtering": False,
-             "fraction_to_mask": fraction_to_mask,
-             "property_goal": property_goals,
-         }
-     algorithm_application = get_application(algorithm.split(":")[0])
-     algorithm_version = algorithm.split(" ")[-1].lower()
-     config = algorithm_application(
-         algorithm_version=algorithm_version,
-         search=search.lower(),
-         temperature=temperature,
-         tolerance=tolerance,
-         sampling_wrapper=sampling_wrapper,
      )
-     model = RegressionTransformer(configuration=config, target=target)
-     samples = list(model.sample(number_of_samples))
-     if algorithm_version == "polymer" and task == "Generate":
-         correct_samples = [(s, p) for s, p in samples if "." in s]
-         while len(correct_samples) < number_of_samples:
-             samples = list(model.sample(number_of_samples))
-             correct_samples.extend(
-                 [
-                     (s, p)
-                     for s, p in samples
-                     if "." in s and (s, p) not in correct_samples
-                 ]
-             )
-         samples = correct_samples
-     if task == "Predict":
-         return draw_grid_predict(samples[0], target, domain=algorithm.split(":")[0])
      else:
-         return draw_grid_generate(samples, domain=algorithm.split(":")[0])


  if __name__ == "__main__":

      # Preparation (retrieve all available algorithms)
-     all_algos = ApplicationsRegistry.list_available()
-     rt_algos = list(
-         filter(lambda x: "RegressionTransformer" in x["algorithm_name"], all_algos)
-     )
-     rt_names = list(map(get_rt_name, rt_algos))
-
-     properties = {}
-     for algo in rt_algos:
-         application = get_application(
-             algo["algorithm_application"].split("Transformer")[-1]
-         )
-         data = get_inference_dict(
-             application=application, algorithm_version=algo["algorithm_version"]
-         )
-         properties[get_rt_name(algo)] = data
-     properties

      # Load metadata
      metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

-     examples = pd.read_csv(
-         metadata_root.joinpath("regression_transformer_examples.csv"), header=None
-     ).fillna("")

-     with open(metadata_root.joinpath("regression_transformer_article.md"), "r") as f:
          article = f.read()
-     with open(
-         metadata_root.joinpath("regression_transformer_description.md"), "r"
-     ) as f:
          description = f.read()

      demo = gr.Interface(
-         fn=regression_transformer,
-         title="Regression Transformer",
          inputs=[
-             gr.Dropdown(rt_names, label="Algorithm version", value="Molecules: Qed"),
-             gr.Radio(choices=["Predict", "Generate"], label="Task", value="Generate"),
              gr.Textbox(
-                 label="Input", placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1", lines=1
-             ),
-             gr.Slider(
-                 minimum=1, maximum=50, value=10, label="Number of samples", step=1
              ),
-             gr.Radio(choices=["Sample", "Greedy"], label="Search", value="Sample"),
-             gr.Slider(minimum=0.5, maximum=2, value=1, label="Decoding temperature"),
-             gr.Slider(minimum=5, maximum=100, value=30, label="Tolerance", step=1),
-             gr.Radio(choices=[True, False], label="Sampling Wrapper", value=True),
-             gr.Slider(minimum=0, maximum=1, value=0.5, label="Fraction to mask"),
-             gr.Textbox(label="Property goal", placeholder="<qed>:0.75", lines=1),
-             gr.Textbox(label="Tokens to mask", placeholder="N, C", lines=1),
-             gr.Textbox(
-                 label="Substructures to mask", placeholder="C(=O), C#C", lines=1
-             ),
-             gr.Textbox(
-                 label="Substructures to keep", placeholder="C1=CC=C(Cl)C=C1", lines=1
              ),
          ],
          outputs=gr.HTML(label="Output"),
          article=article,
          description=description,
-         examples=examples.values.tolist(),
      )
      demo.launch(debug=True, show_error=True)

  import pathlib

  import gradio as gr
+ import numpy as np
  import pandas as pd
+ from gt4sd.properties.molecules import MOLECULE_PROPERTY_PREDICTOR_FACTORY
+
+ from utils import draw_grid_predict

  logger = logging.getLogger(__name__)
  logger.addHandler(logging.NullHandler())

+ REMOVE = ["docking", "docking_tdc", "molecule_one", "askcos", "plogp"]
+ REMOVE.extend(["similarity_seed", "activity_against_target", "organtox"])

+ MODEL_PROP_DESCRIPTION = {
+     "Tox21": "NR-AR, NR-AR-LBD, NR-AhR, NR-Aromatase, NR-ER, NR-ER-LBD, NR-PPAR-gamma, SR-ARE, SR-ATAD5, SR-HSE, SR-MMP, SR-p53",
+     "Sider": "Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,Bening & malignant,General disorders,Endocrine disorders,Surgical & medical procedures,Vascular disorders,Blood & lymphatic disorders,Skin & subcutaneous disorders,Congenital & genetic disorders,Infections,Respiratory & thoracic disorders,Psychiatric disorders,Renal & urinary disorders,Pregnancy conditions,Ear disorders,Cardiac disorders,Nervous system disorders,Injury & procedural complications",
+     "Clintox": "FDA approval, Clinical trial failure",
+ }
+
+
+ def main(property: str, smiles: str, smiles_file: str):
+
+     algo, config = MOLECULE_PROPERTY_PREDICTOR_FACTORY[property.lower()]
+     kwargs = (
+         {"algorithm_version": "v0"} if property in MODEL_PROP_DESCRIPTION.keys() else {}
      )
+     model = algo(config(**kwargs))
+     if smiles is not None and smiles_file is not None:
+         raise ValueError("Pass either smiles or smiles_file, not both.")
+     elif smiles is not None:
+         smiles = [smiles]
+     elif smiles_file is not None:
+         smiles = pd.read_csv(smiles_file.name, header=None, sep="\t")[0].tolist()
+     props = np.array(list(map(model, smiles))).round(2)
+
+     # Expand to 2D array if needed
+     if len(props.shape) == 1:
+         props = np.expand_dims(np.array(props), -1)
+
+     if property in MODEL_PROP_DESCRIPTION.keys():
+         property_names = MODEL_PROP_DESCRIPTION[property].split(",")
      else:
+         property_names = [property]
+
+     return draw_grid_predict(
+         smiles, props, property_names=property_names, domain="Molecules"
+     )


  if __name__ == "__main__":

      # Preparation (retrieve all available algorithms)
+     properties = list(MOLECULE_PROPERTY_PREDICTOR_FACTORY.keys())[::-1]
+     for prop in REMOVE:
+         prop_to_idx = dict(zip(properties, range(len(properties))))
+         properties.pop(prop_to_idx[prop])
+     properties = list(map(lambda x: x.capitalize(), properties))

      # Load metadata
      metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

+     examples = [
+         ["Qed", None, metadata_root.joinpath("examples.smi")],
+         [
+             "Esol",
+             "CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1",
+             None,
+         ],
+     ]

+     with open(metadata_root.joinpath("article.md"), "r") as f:
          article = f.read()
+     with open(metadata_root.joinpath("description.md"), "r") as f:
          description = f.read()

      demo = gr.Interface(
+         fn=main,
+         title="Molecular properties",
          inputs=[
+             gr.Dropdown(properties, label="Property", value="qed"),
              gr.Textbox(
+                 label="Single SMILES",
+                 placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
+                 lines=1,
              ),
+             gr.File(
+                 file_types=[".smi"],
+                 label="Multiple SMILES (tab-separated, `.smi` file)",
              ),
          ],
          outputs=gr.HTML(label="Output"),
          article=article,
          description=description,
+         examples=examples,
      )
      demo.launch(debug=True, show_error=True)
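For orientation, the core of the new `main` function reduces to the following usage pattern of GT4SD's property factory. This is a minimal sketch rather than part of the commit: the `"qed"` key and the `algo(config())` call chain are copied from the diff above, the example SMILES come from `model_cards/examples.smi`, and the exact set of factory keys depends on the installed `gt4sd` version (`>=1.1.1` per `requirements.txt`).

```python
# Minimal sketch (not part of this commit): score SMILES with a GT4SD property
# predictor the same way app.py's main() does.
from gt4sd.properties.molecules import MOLECULE_PROPERTY_PREDICTOR_FACTORY

# Look up the (algorithm, configuration) pair for a lowercase property key, e.g. "qed".
algo, config = MOLECULE_PROPERTY_PREDICTOR_FACTORY["qed"]
model = algo(config())  # rule-based properties need no extra kwargs (assumption)

for smi in ["CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1", "ClC(Cl)C(Cl)Cl"]:
    print(smi, round(model(smi), 2))  # the predictor is callable on a single SMILES
```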
model_cards/article.md ADDED
@@ -0,0 +1,68 @@
+ # Supported molecular properties
+
+
+ ### ClinTox
+ A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [ClinTox](https://moleculenet.org/datasets-1) dataset, which has two endpoints: probability of FDA approval and probability of failure in clinical trials. When using this model, please cite [Born et al. (2023)](#toxsmi-citation).
+
+ ### SIDER
+ A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [SIDER](https://moleculenet.org/datasets-1) dataset for 27 different types of side effects of drugs. When using this model, please cite [Born et al. (2023)](#toxsmi-citation).
+
+ ### Tox21
+ A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [Tox21](https://tripod.nih.gov/tox/) dataset with 12 different types of environmental toxicities. When using this model, please cite [Born et al. (2023)](#toxsmi-citation).
+
+ ### SCScore
+ Predict the synthetic complexity score (SCScore) as presented in [Coley et al. (*J. Chem. Inf. Model.*; 2018)](https://pubs.acs.org/doi/full/10.1021/acs.jcim.7b00622).
+
+ ### SAS
+ Estimate the synthetic accessibility score (SAS) as presented in [Ertl et al. (*Journal of Cheminformatics*; 2009)](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-1-8).
+
+ ### Lipinski
+ Measure whether a molecule conforms to the Lipinski rule of five as presented in [Lipinski et al. (*Advanced Drug Delivery Reviews*; 2001)](https://www.sciencedirect.com/science/article/abs/pii/S0169409X00001290?via%3Dihub).
+
+ ### Penalized logP
+ Measure the penalized logP (partition coefficient) score as presented in [Gomez-Bombarelli et al. (*ACS Central Science*; 2018)](https://arxiv.org/abs/1610.02415v1). This is the logP minus the number of rings with > 6 atoms minus the SAS.
+
+ ### QED
+ Measure the drug-likeness as presented in [Bickerton et al. (*Nature Chemistry*; 2012)](https://www.nature.com/articles/nchem.1243).
+
+ ### LogP
+ Measure the logP (partition coefficient) of a molecule as presented in [Wildman et al. (*J. Chem. Inf. Comput. Sci.*; 1999)](https://pubs.acs.org/doi/full/10.1021/ci990307l).
+
+ ### Bertz
+ Calculate the first general index of molecular complexity as presented in [Bertz (*Journal of the American Chemical Society*; 1981)](https://pubs.acs.org/doi/pdf/10.1021/ja00402a071).
+
+ ### TPSA
+ Calculate the total polar surface area of a molecule as presented in [Ertl et al. (*Journal of Medicinal Chemistry*; 2000)](https://pubs.acs.org/doi/full/10.1021/jm000942e).
+
+ ### Is-Scaffold
+ Whether the molecule is identical to its [Murcko scaffold](https://rdkit.org/docs/source/rdkit.Chem.Scaffolds.MurckoScaffold.html).
+
+ ### Number-Of-X
+ Calculated with [RDKit](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html).
+
+ ### Molecular Weight
+ Calculated with [RDKit](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html).
+
+
+ ### ToxSmi citation
+ ```bib
+ @article{born2023chemical,
+     title={Chemical representation learning for toxicity prediction},
+     author={Born, Jannis and Markert, Greta and Janakarajan, Nikita and Kimber, Talia B. and Volkamer, Andrea and Rodriguez Martinez, Maria and Manica, Matteo},
+     journal={Under review at Digital Discovery},
+     year={2023}
+ }
+ ```
+
+
+ ### Unsupported properties
+ The following molecular properties are available via the GT4SD API but not in this UI:
+ - [MoleculeOne](https://tdcommons.ai/functions/oracles/#moleculeone) endpoint for retrosynthesis
+ - [ASKCOS](https://tdcommons.ai/functions/oracles/#askcos) endpoint for retrosynthesis
+ - [TDC-Docking](https://tdcommons.ai/functions/oracles/#docking-scores) endpoint for docking against a user-provided target
+ - [TDC-Docking](https://tdcommons.ai/functions/oracles/#docking-scores) endpoint for docking against *3pbl*.
+ - [Protein-ligand binding](https://tdcommons.ai/functions/oracles/#dopamine-receptor-d2-drd2) against one of the targets *drd2*, *gsk3b*, *jnk3*, *fpscores*, *cyp3a4_veith*, *drd2_current*, *gsk3b_current* or *jnk3_current*.
+ - [Tanimoto similarity](https://tdcommons.ai/functions/oracles/#similaritydissimilarity) to a seed molecule.
+
+
+ Moreover, GT4SD also includes properties on other entities such as [proteins](https://gt4sd.github.io/gt4sd-core/api/gt4sd.properties.proteins.html) and [crystals](https://gt4sd.github.io/gt4sd-core/api/gt4sd.properties.crystals.html).
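Several of the rule-based descriptors documented above (QED, logP, TPSA, Bertz, molecular weight) can also be computed directly with RDKit, which the Number-Of-X and Molecular Weight entries already reference. The following is a minimal, hedged sketch for cross-checking the web-app's output; the RDKit calls are standard API, but how GT4SD wraps them internally is not shown in this diff.

```python
# Minimal sketch (not part of this commit): cross-check a few rule-based
# properties from the article above using plain RDKit.
from rdkit import Chem
from rdkit.Chem import Crippen, Descriptors, QED

mol = Chem.MolFromSmiles("CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1")

print("QED  :", round(QED.qed(mol), 2))            # drug-likeness (Bickerton et al., 2012)
print("logP :", round(Crippen.MolLogP(mol), 2))    # Wildman & Crippen partition coefficient
print("TPSA :", round(Descriptors.TPSA(mol), 2))   # topological polar surface area (Ertl et al., 2000)
print("Bertz:", round(Descriptors.BertzCT(mol), 2))  # Bertz complexity index (1981)
print("MolWt:", round(Descriptors.MolWt(mol), 2))  # molecular weight
```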
model_cards/description.md ADDED
@@ -0,0 +1,7 @@
+
+
+ <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
+
+ ### Molecular property prediction
+
+ This is the GT4SD web-app for prediction of various molecular properties. For **examples** and **documentation** of the supported properties, please see below. Please note that this API does not expose **all** properties that are supported in GT4SD (a list of the non-supported ones can be found at the bottom).
model_cards/examples.smi ADDED
@@ -0,0 +1,13 @@
+ Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O
+ C=CCN1C(=O)C(=NNC(=S)NC2OC(COC(C)=O)C(OC(C)=O)C(OC(C)=O)C2OC(C)=O)c2ccccc21
+ O=C1C(=Cc2ccc(F)cc2)CCOc2c1ccc1ccccc21
+ CC(C)CNc1cc(NCC(C)C)nc(NCC(C)C)n1
+ CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1
+ COc1ccc2ccccc2c1C1CC1NC(C)=O
+ Cc1ccc(-n2c(=O)[nH]cc(C(=O)Nc3ccc4c(c3)OCCO4)c2=O)cc1
+ Cc1ccc(NCc2nnc(SCC(=O)NCCc3ccccc3)n2C)cc1
+ CCCNC(=O)c1ccc2c(c1)N=C(C)c1c(C)ccc(C)c1S2
+ COc1ccc(Cn2ccn(CC(=O)Nc3cc(C)ccc3C)c(=O)c2=O)cc1
+ Cn1nccc1C(=O)NN=Cc1c(O)ccc2ccccc12
+ CCOC(=O)Nc1cc(N)c2c(n1)NC(C)C(c1ccccc1)=N2
+ Cn1nc(N)c2ncc(C(Cl)(Cl)Cl)nc21
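The file above doubles as the `Qed` example in `app.py`, and an uploaded `.smi` file goes through the same parsing step. Below is a minimal sketch of that step, not part of the commit; the path refers to this repo's example file, and a tab-separated name column, if present, would be ignored.

```python
# Minimal sketch (not part of this commit): parse a .smi file the way
# app.py's main() does before mapping the property predictor over it.
import pandas as pd

smiles = pd.read_csv("model_cards/examples.smi", header=None, sep="\t")[0].tolist()
print(f"{len(smiles)} molecules loaded; first entry: {smiles[0]}")
```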
model_cards/regression_transformer.png DELETED
Binary file (225 kB)
 
model_cards/regression_transformer_article.md DELETED
@@ -1,113 +0,0 @@
- # Model documentation & parameters
-
- ## Parameters
-
- ### Algorithm Version
- Which model checkpoint to use (trained on different datasets).
-
- ### Task
- Whether the multitask model should be used for property prediction or conditional generation (default).
-
- ### Input
- The input sequence. In the default setting (where `Task` is *Generate* and `Sampling Wrapper` is *True*) this can be a seed SMILES (for the molecule models) or amino-acid sequence (for the protein models). The model will locally adapt the seed sequence by masking `Fraction to mask` of the tokens.
- If the `Task` is *Predict*, the sequences are given as SELFIES for the molecule models. Moreover, the tokens that should be predicted (`[MASK]` in the input) have to be given explicitly. Populate the examples to understand better.
- NOTE: When setting `Task` to *Generate*, and `Sampling Wrapper` to *False*, the user has maximal control about the generative process and can explicitly decide which tokens should be masked.
-
- ### Number of samples
- How many samples should be generated (between 1 and 50). If `Task` is *Predict*, this has to be set to 1.
-
- ### Search
- Decoding search method. Use *Sample* if `Task` is *Generate*. If `Task` is *Predict*, use *Greedy*.
-
- ### Tolerance
- Precision tolerance; only used if `Task` is *Generate*. This is a single float between 0 and 100 for the the tolerated deviation between desired/primed property and predicted property of the generated molecule. Given in percentage with respect to the property range encountered during training.
- NOTE: The tolerance is *only* used for post-hoc filtering of the generated samples.
-
- ### Sampling Wrapper
- Only used if `Task` is *Generate*. If set to *False*, the user has to provide a full RT-sequence as `Input` and has to **explicitly** decide which tokens are masked (see example below). This gives full control but is tedious. Instead, if `Sampling Wrapper` is set to *True*, the RT stochastically determines which parts of the sequence are masked.
- **NOTE**: All below arguments only apply if `Sampling Wrapper` is *True*.
-
- #### Fraction to mask
- Specifies the ratio of tokens that can be changed by the model. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
-
- #### Property goal
- Specifies the desired target properties for the generation. Need to be given in the format `<prop>:value`. If the model supports multiple properties, give them separated by a comma `,`. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
-
- #### Tokens to mask
- Optionally specifies which tokens (atoms, bonds etc) can be masked. Please separate multiple tokens by comma (`,`). If not specified, all tokens can be masked. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
-
- #### Substructures to mask
- Optionally specifies a list of substructures that should *definitely* be masked (excluded from stochastic masking). Given in SMILES format. If multiple are provided, separate by comma (`,`). Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
- *NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
-
- #### Substructures to keep
- Optionally specifies a list of substructures that should definitely be present in the target sample (i.e., excluded from stochastic masking). Given in SMILES format. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
- *NOTE*: This keeps tokens even if they are included in `tokens_to_mask`.
- *NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
-
-
-
- # Model card -- Regression Transformer
-
- **Model Details**: The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task. This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation.
-
- **Developers**: Jannis Born and Matteo Manica from IBM Research.
-
- **Distributors**: Original authors' code wrapped and distributed by GT4SD Team (2023) from IBM Research.
-
- **Model date**: Preprint released in 2022, currently under review at *Nature Machine Intelligence*.
-
- **Algorithm version**: Models trained and distributed by the original authors.
- - **Molecules: QED**: Model trained on 1.6M molecules (SELFIES) from ChEMBL and their QED scores.
- - **Molecules: Solubility**: QED model finetuned on the ESOL dataset from [Delaney et al (2004), *J. Chem. Inf. Comput. Sci.*](https://pubs.acs.org/doi/10.1021/ci034243x) to predict water solubility. Model trained on augmented SELFIES.
- - **Molecules: USPTO**: Model trained on 2.8M [chemical reactions](https://figshare.com/articles/dataset/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873) from the US patent office. The model used SELFIES and a synthetic property (total molecular weight of all precursors).
- - **Molecules: Polymer**: Model finetuned on 600 ROPs (ring-opening polymerizations) with monomer-catalyst pairs. Model used three properties: conversion (`<conv>`), PDI (`<pdi>`) and Molecular Weight (`<molwt>`). Model trained with augmented SELFIES, optimized only to generate catalysts, given a monomer and the property constraints. See the example for details.
- - **Molecules: Cosmo_acdl**: Model finetuned on 56k molecules with two properties (*pKa_ACDL* and *pKa_COSMO*). Model used augmented SELFIES.
- - **Molecules: Pfas**: Model finetuned on ~1k PFAS (Perfluoroalkyl and Polyfluoroalkyl Substances) molecules with 9 properties including some experimentally measured ones (biodegradability, LD50 etc) and some synthetic ones (SCScore, molecular weight). Model trained on augmented SELFIES.
- - **Molecules: Logp_and_synthesizability**: Model trained on 2.9M molecules (SELFIES) from PubChem with **two** synthetic properties, the logP (partition coefficient) and the [SCScore by Coley et al. (2018); *J. Chem. Inf. Model.*](https://pubs.acs.org/doi/full/10.1021/acs.jcim.7b00622?casa_token=JZzOrdWlQ_QAAAAA%3A3_ynCfBJRJN7wmP2gyAR0EWXY-pNW_l-SGwSSU2SGfl5v5SxcvqhoaPNDhxq4THberPoyyYqTZELD4Ck)
- - **Molecules: Crippen_logp**: Model trained on 2.9M molecules (SMILES) from PubChem, but *only* on logP (partition coefficient).
- - **Proteins: Stability**: Model pretrained on 2.6M peptides from UniProt with the Boman index as property. Finetuned on the [**Stability**](https://www.science.org/doi/full/10.1126/science.aan0693) dataset from the [TAPE benchmark](https://proceedings.neurips.cc/paper/2019/hash/37f65c068b7723cd7809ee2d31d7861c-Abstract.html) which has ~65k samples.
-
- **Model type**: A Transformer-based language model that is trained on alphanumeric sequence to simultaneously perform sequence regression or conditional sequence generation.
-
- **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
- All models are trained with an alternated training scheme that alternated between optimizing the cross-entropy loss on the property tokens ("regression") or the self-consistency objective on the molecular tokens. See the [Regression Transformer](https://arxiv.org/abs/2202.01338) paper for details.
-
- **Paper or other resource for more information**:
- The [Regression Transformer](https://arxiv.org/abs/2202.01338) paper. See the [source code](https://github.com/IBM/regression-transformer) for details.
-
- **License**: MIT
-
- **Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
-
- **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
-
- **Primary intended uses/users**: Researchers and computational chemists using the model for model comparison or research exploration purposes.
-
- **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
-
- **Factors**: Not applicable.
-
- **Metrics**: High predictive power for the properties of that specific algorithm version.
-
- **Datasets**: Different ones, as described under **Algorithm version**.
-
- **Ethical Considerations**: No specific considerations as no private/personal data is involved. Please consult with the authors in case of questions.
-
- **Caveats and Recommendations**: Please consult with original authors in case of questions.
-
- Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
-
-
- ## Citation
-
- ```bib
- @article{born2022regression,
-     title={Regression Transformer: Concurrent Conditional Generation and Regression by Blending Numerical and Textual Tokens},
-     author={Born, Jannis and Manica, Matteo},
-     journal={arXiv preprint arXiv:2202.01338},
-     note={Spotlight talk at ICLR workshop on Machine Learning for Drug Discovery},
-     year={2022}
- }
- ```
-
model_cards/regression_transformer_description.md DELETED
@@ -1,13 +0,0 @@
-
-
- <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
-
- ### Concurrent sequence regression and generation for molecular language modeling
-
- The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task.
- This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation. For details see the [arXiv preprint](https://arxiv.org/abs/2202.01338), the [development code](https://github.com/IBM/regression-transformer) and the [GT4SD endpoint](https://github.com/GT4SD/gt4sd-core) for inference.
-
- Each `algorithm_version` refers to one trained model. Each model can be used for **two tasks**, either to *predict* one (or multiple) properties of a molecule or to *generate* a molecule (given a seed molecule and a property constraint).
-
- For **examples** and **documentation** of the model parameters, please see below.
- Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/regression_transformer_examples.csv DELETED
@@ -1,9 +0,0 @@
- Molecules: Logp_and_synthesizability,Generate,CCOC1=NC=NC(=C1C)NCCOC(C)C,3,Sample,1.2,20,True,0.3,"<logp>:0.390, <scs>:2.628",N,(C)C,CCO
- Molecules: Qed,Generate,CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1,10,Sample,1.0,30,True,0.5,<qed>:0.75,"N, C","C(=O), CC",C1=CC=C(Cl)C=C1
- Molecules: Logp_and_synthesizability,Predict,<logp>[MASK][MASK][MASK][MASK][MASK]|<scs>[MASK][MASK][MASK][MASK][MASK]|[C][C][O][C][=N][C][=N][C][Branch1_2][Branch1_1][=C][Ring1][Branch1_2][C][N][C][C][O][C][Branch1_1][C][C][C],1,Greedy,1.0,30,False,0.0,,,,
- Proteins: Stability,Predict,<stab>[MASK][MASK][MASK][MASK][MASK]|GSQEVNSGTQTYKNASPEEAERIARKAGATTWTEKGNKWEIRI,1,Greedy,1.0,1,False,0.0,,,,
- Proteins: Stability,Generate,GSQEVNSGTQTYKNASPEEAERIARKAGATTWTEKGNKWEIRI,10,Sample,1.2,30,True,0.3,<stab>:0.393,,SQEVNSGTQTYKN,WTEK
- Molecules: Qed,Generate,<qed>0.717|[MASK][MASK][MASK][MASK][MASK][C][Branch2_1][Ring1][Ring1][MASK][MASK][=C][C][Branch1_1][C][C][=N][C][MASK][MASK][=C][C][=C][Ring1][O][Ring1][Branch1_2][=C][Ring2][MASK][MASK],10,Sample,1.2,30,False,0.0,,,,
- Molecules: Solubility,Generate,ClC(Cl)C(Cl)Cl,5,Sample,1.3,40,True,0.4,<esol>:0.754,,,
- Molecules: Polymer,Predict,<conv>[MASK][MASK][MASK][MASK]|<pdi>[MASK][MASK][MASK][MASK][MASK]|<molwt>[MASK][MASK][MASK][MASK][MASK]|[C][Branch1_2][C][=O][O][C@@Hexpl][Branch1_1][C][C][C][Branch1_2][C][=O][O][C@Hexpl][Ring1][Branch2_2][C].[C][C][C][Branch2_1][Ring1][Ring1][N][C][Branch1_1][=C][N][C][=C][C][=C][Branch1_1][Ring1][O][C][C][=C][Ring1][Branch2_1][=S][C][C][C][Ring2][Ring1][C],1,Greedy,1,0,False,,,,,
- Molecules: Polymer,Generate,C1(=O)O[C@@H](C)C(=O)O[C@H]1C.C2CC(NC(NC1=CC=C(OC)C=C1)=S)CCC2,10,Sample,1.3,50,True,0.5,"<pdi>:3.490, <conv>:0.567, <molwt>:3.567",,,C1(=O)O[C@@H](C)C(=O)O[C@H]1C
requirements.txt CHANGED
@@ -8,7 +8,7 @@ torch-sparse
  torch-geometric
  torchvision==0.13.1
  torchaudio==0.12.1
- gt4sd>=1.0.6
+ gt4sd>=1.1.1
  molgx>=0.22.0a1
  molecule_generation
  nglview
utils.py CHANGED
@@ -1,136 +1,25 @@
- import json
  import logging
- import os
- from collections import defaultdict
- from typing import Dict, List, Tuple
-
  import mols2grid
  import pandas as pd
- from gt4sd.algorithms import (
-     RegressionTransformerMolecules,
-     RegressionTransformerProteins,
- )
- from gt4sd.algorithms.core import AlgorithmConfiguration
  from rdkit import Chem
- from terminator.selfies import decoder

  logger = logging.getLogger(__name__)
  logger.addHandler(logging.NullHandler())


- def get_application(application: str) -> AlgorithmConfiguration:
-     """
-     Convert application name to AlgorithmConfiguration.
-
-     Args:
-         application: Molecules or Proteins
-
-     Returns:
-         The corresponding AlgorithmConfiguration
-     """
-     if application == "Molecules":
-         application = RegressionTransformerMolecules
-     elif application == "Proteins":
-         application = RegressionTransformerProteins
-     else:
-         raise ValueError(
-             "Currently only models for molecules and proteins are supported"
-         )
-     return application
-
-
- def get_inference_dict(
-     application: AlgorithmConfiguration, algorithm_version: str
- ) -> Dict:
-     """
-     Get inference dictionary for a given application and algorithm version.
-
-     Args:
-         application: algorithm application (Molecules or Proteins)
-         algorithm_version: algorithm version (e.g. qed)
-
-     Returns:
-         A dictionary with the inference parameters.
-     """
-     config = application(algorithm_version=algorithm_version)
-     with open(os.path.join(config.ensure_artifacts(), "inference.json"), "r") as f:
-         data = json.load(f)
-     return data
-
-
- def get_rt_name(x: Dict) -> str:
-     """
-     Get the UI display name of the regression transformer.
-
-     Args:
-         x: dictionary with the inference parameters
-
-     Returns:
-         The display name
-     """
-     return (
-         x["algorithm_application"].split("Transformer")[-1]
-         + ": "
-         + x["algorithm_version"].capitalize()
-     )
-
-
- def draw_grid_predict(prediction: str, target: str, domain: str) -> str:
-     """
-     Uses mols2grid to draw a HTML grid for the prediction
-
-     Args:
-         prediction: Predicted sequence.
-         target: Target molecule
-         domain: Domain of the prediction (molecules or proteins)
-
-     Returns:
-         HTML to display
-     """
-
-     if domain not in ["Molecules", "Proteins"]:
-         raise ValueError(f"Unsupported domain {domain}")
-
-     seq = target.split("|")[-1]
-     converter = (
-         decoder
-         if domain == "Molecules"
-         else lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x))
-     )
-     try:
-         seq = converter(seq)
-     except Exception:
-         logger.warning(f"Could not draw sequence {seq}")
-
-     result = {"SMILES": [seq], "Name": ["Target"]}
-     # Add properties
-     for prop in prediction.split("<")[1:]:
-         result[
-             prop.split(">")[0]
-         ] = f"{prop.split('>')[0].capitalize()} = {prop.split('>')[1]}"
-     result_df = pd.DataFrame(result)
-     obj = mols2grid.display(
-         result_df,
-         tooltip=list(result.keys()),
-         height=900,
-         n_cols=1,
-         name="Results",
-         size=(600, 700),
-     )
-     return obj.data
-
-
- def draw_grid_generate(
-     samples: List[Tuple[str]], domain: str, n_cols: int = 5, size=(140, 200)
  ) -> str:
      """
-     Uses mols2grid to draw a HTML grid for the generated molecules

      Args:
-         samples: The generated samples (with properties)
-         domain: Domain of the prediction (molecules or proteins)
-         n_cols: Number of columns in grid. Defaults to 5.
-         size: Size of molecule in grid. Defaults to (140, 200).

      Returns:
          HTML to display
@@ -140,29 +29,25 @@ def draw_grid_generate(
          raise ValueError(f"Unsupported domain {domain}")

      if domain == "Proteins":
-         try:
-             smis = list(
-                 map(lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x[0])), samples)
-             )
-         except Exception:
-             logger.warning(f"Could not convert some sequences {samples}")
      else:
-         smis = [s[0] for s in samples]

-     result = defaultdict(list)
-     result.update({"SMILES": smis, "Name": [f"sample_{i}" for i in range(len(smis))]})
-
-     # Create properties
-     properties = [s.split("<")[1] for s in samples[0][1].split(">")[:-1]]
-     # Fill properties
-     for sample in samples:
-         for prop in properties:
-             value = float(sample[1].split(prop)[-1][1:].split("<")[0])
-             result[prop].append(f"{prop} = {value}")

-     result_df = pd.DataFrame(result)
      obj = mols2grid.display(
-         result_df,
          tooltip=list(result.keys()),
          height=1100,
          n_cols=n_cols,

  import logging
+ from typing import List
+ import numpy as np
  import mols2grid
  import pandas as pd
  from rdkit import Chem

  logger = logging.getLogger(__name__)
  logger.addHandler(logging.NullHandler())


+ def draw_grid_predict(
+     sequences: List[str], properties: np.array, property_names: List[str], domain: str
  ) -> str:
      """
+     Uses mols2grid to draw a HTML grid for the prediction

      Args:
+         sequences: Sequences for which properties are predicted.
+         properties: Predicted properties. Array of shape (n_samples, n_properties).
+         names: List of property names
+         domain: Domain of the prediction (molecules or proteins).

      Returns:
          HTML to display

          raise ValueError(f"Unsupported domain {domain}")

      if domain == "Proteins":
+         converter = lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x))
      else:
+         converter = lambda x: x

+     smiles = []
+     for sequence in sequences:
+         try:
+             seq = converter(sequence)
+             smiles.append(seq)
+         except Exception:
+             logger.warning(f"Could not draw sequence {seq}")

+     result = pd.DataFrame({"SMILES": smiles})
+     for i, name in enumerate(property_names):
+         result[name] = properties[:, i]
+     n_cols = min(3, len(result))
+     size = (140, 200) if len(result) > 3 else (600, 700)
      obj = mols2grid.display(
+         result,
          tooltip=list(result.keys()),
          height=1100,
          n_cols=n_cols,