jannisborn committed on
Commit
1298030
0 Parent(s):

Duplicate from jannisborn/gt4sd-advanced-manufacturing

Browse files
Files changed (10) hide show
  1. .gitattributes +34 -0
  2. .gitignore +1 -0
  3. LICENSE +21 -0
  4. README.md +15 -0
  5. app.py +102 -0
  6. model_cards/article.md +68 -0
  7. model_cards/description.md +6 -0
  8. model_cards/examples.csv +2 -0
  9. requirements.txt +29 -0
  10. utils.py +48 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: GT4SD - Advanced Manufacturing
3
+ emoji: 💡
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.9.1
8
+ app_file: app.py
9
+ pinned: false
10
+ python_version: 3.8.13
11
+ pypi_version: 20.2.4
12
+ duplicated_from: jannisborn/gt4sd-advanced-manufacturing
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pathlib
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from gt4sd.algorithms.controlled_sampling.advanced_manufacturing import (
6
+ CatalystGenerator,
7
+ AdvancedManufacturing,
8
+ )
9
+ from gt4sd.algorithms.registry import ApplicationsRegistry
10
+
11
+ from utils import draw_grid_generate
12
+
13
+ logger = logging.getLogger(__name__)
14
+ logger.addHandler(logging.NullHandler())
15
+
16
+
17
def run_inference(
    algorithm_version: str,
    target_binding_energy: float,
    primer_smiles: str,
    length: float,
    number_of_points: int,
    number_of_steps: int,
    number_of_samples: int,
):
    """
    Sample molecules with AdvancedManufacturing and render them as a grid.

    Args:
        algorithm_version: Forwarded to ``CatalystGenerator`` as
            ``algorithm_version``.
        target_binding_energy: Passed to ``AdvancedManufacturing`` as ``target``.
        primer_smiles: Forwarded to ``CatalystGenerator`` as ``primer_smiles``.
            Unless it is the empty string, it is also shown as the seed
            molecule in the resulting grid.
        length: Forwarded to ``CatalystGenerator`` as ``generated_length``.
        number_of_points: Forwarded to ``CatalystGenerator``.
        number_of_steps: Forwarded to ``CatalystGenerator``.
        number_of_samples: Number of items requested from the model's sampler.

    Returns:
        Whatever ``draw_grid_generate`` returns for the seed and the samples,
        drawn with five columns.
    """
    generator_config = CatalystGenerator(
        primer_smiles=primer_smiles,
        generated_length=length,
        number_of_steps=number_of_steps,
        number_of_points=number_of_points,
        algorithm_version=algorithm_version,
    )
    algorithm = AdvancedManufacturing(generator_config, target=target_binding_energy)

    # Exhaust the sampler up front so the grid receives a concrete list.
    generated = list(algorithm.sample(number_of_samples))

    # Only a non-empty primer is displayed as a seed next to the samples.
    if primer_smiles == "":
        primers = []
    else:
        primers = [primer_smiles]

    return draw_grid_generate(samples=generated, seeds=primers, n_cols=5)
39
+
40
+
41
if __name__ == "__main__":
    # Ask the GT4SD registry for every available algorithm and keep the
    # versions of those whose name contains "AdvancedManufact"; these are the
    # choices offered in the version dropdown.
    all_algos = ApplicationsRegistry.list_available()
    algos = [
        entry["algorithm_version"]
        for entry in all_algos
        if "AdvancedManufact" in entry["algorithm_name"]
    ]

    # The model-card assets live in a "model_cards" folder next to this file.
    metadata_root = pathlib.Path(__file__).parent / "model_cards"

    # Example rows are read (missing cells become "") and echoed to stdout.
    # They are not handed to the interface below.
    examples = pd.read_csv(metadata_root / "examples.csv", header=None)
    examples = examples.fillna("")
    print("Examples: ", examples.values.tolist())

    article = (metadata_root / "article.md").read_text()
    description = (metadata_root / "description.md").read_text()

    # Input widgets, created in the positional order of run_inference's
    # parameters.
    version_dropdown = gr.Dropdown(algos, label="Algorithm version", value="v0")
    energy_slider = gr.Slider(
        minimum=1, maximum=100, value=10, label="Target binding energy"
    )
    primer_box = gr.Textbox(
        label="Primer SMILES",
        placeholder="FP(F)F.CP(C)c1ccccc1.[Au]",
        lines=1,
    )
    length_slider = gr.Slider(
        minimum=5,
        maximum=400,
        value=100,
        label="Maximal sequence length",
        step=1,
    )
    points_slider = gr.Slider(
        minimum=16, maximum=128, value=32, label="Number of points", step=1
    )
    steps_slider = gr.Slider(
        minimum=16, maximum=128, value=50, label="Number of steps", step=1
    )
    samples_slider = gr.Slider(
        minimum=1, maximum=50, value=10, label="Number of samples", step=1
    )
    output_panel = gr.HTML(label="Output")

    demo = gr.Interface(
        fn=run_inference,
        title="Advanced Manufacturing",
        inputs=[
            version_dropdown,
            energy_slider,
            primer_box,
            length_slider,
            points_slider,
            steps_slider,
            samples_slider,
        ],
        outputs=output_panel,
        article=article,
        description=description,
    )
    demo.launch(debug=True, show_error=True)
model_cards/article.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model documentation & parameters
2
+
3
+ **Algorithm Version**: Which model version to use.
4
+
5
+ **Target binding energy**: The desired binding energy.
6
+
7
+ **Primer SMILES**: A SMILES string used to prime the generation.
8
+
9
+ **Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
10
+
11
+ **Number of points**: Number of points to sample with the Gaussian Process.
12
+
13
+ **Number of steps**: Number of optimization steps in the Gaussian Process optimization.
14
+
15
+ **Number of samples**: How many samples should be generated (between 1 and 50).
16
+
17
+
18
+
19
+ # Model card -- AdvancedManufacturing
20
+
21
+ **Model Details**: *AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
22
+
23
+ **Developers**: Oliver Schilter and colleagues from IBM Research.
24
+
25
+ **Distributors**: Original authors' code integrated into GT4SD.
26
+
27
+ **Model date**: Not yet published.
28
+
29
+ **Model version**: Different types of models trained on NCCR data using SMILES or SELFIES, potentially also with augmentation.
30
+
31
+ **Model type**: A sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
32
+
33
+ **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
34
+ N.A.
35
+
36
+ **Paper or other resource for more information**:
37
+ TBD
38
+
39
+ **License**: MIT
40
+
41
+ **Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
42
+
43
+ **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular catalyst discovery.
44
+
45
+ **Primary intended uses/users**: Researchers and computational chemists using the model for model comparison or research exploration purposes.
46
+
47
+ **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
48
+
49
+ **Metrics**: N.A.
50
+
51
+ **Datasets**: Data provided through NCCR.
52
+
53
+ **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
54
+
55
+ **Caveats and Recommendations**: Unclear, please consult with original authors in case of questions.
56
+
57
+ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596)
58
+
59
+ ## Citation
60
+ TBD. In the meantime, please cite:
61
+ ```bib
62
+ @article{manica2022gt4sd,
63
+ title={GT4SD: Generative Toolkit for Scientific Discovery},
64
+ author={Manica, Matteo and Cadow, Joris and Christofidellis, Dimitrios and Dave, Ashish and Born, Jannis and Clarke, Dean and Teukam, Yves Gaetan Nana and Hoffman, Samuel C and Buchan, Matthew and Chenthamarakshan, Vijil and others},
65
+ journal={arXiv preprint arXiv:2207.03928},
66
+ year={2022}
67
+ }
68
+ ```
model_cards/description.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
+
3
+ *AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
4
+
5
+ For **examples** and **documentation** of the model parameters, please see below.
6
+ Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596)) at the bottom of this page.
model_cards/examples.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ v0,10,,100,10,50,10
2
+ v0,10,FP(F)F.CP(C)c1ccccc1.[Au],100,10,50,10
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -f https://download.pytorch.org/whl/cpu/torch_stable.html
2
+ -f https://data.pyg.org/whl/torch-1.12.1+cpu.html
3
+ # pip==20.2.4
4
+ torch==1.12.1
5
+ torch-scatter
6
+ torch-spline-conv
7
+ torch-sparse
8
+ torch-geometric
9
+ torchvision==0.13.1
10
+ torchaudio==0.12.1
11
+ gt4sd>=1.0.5
12
+ molgx>=0.22.0a1
13
+ molecule_generation
14
+ nglview
15
+ PyTDC==0.3.7
16
+ gradio==3.12.0
17
+ markdown-it-py>=2.1.0
18
+ mols2grid>=0.2.0
19
+ numpy==1.23.5
20
+ pandas>=1.0.0
21
+ terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
22
+ guacamol_baselines @ git+https://github.com/GT4SD/guacamol_baselines.git@v0.0.2
23
+ moses @ git+https://github.com/GT4SD/moses.git@v0.1.0
24
+ paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@0.0.4
25
+ paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@0.0.2
26
+ paccmann_gp @ git+https://github.com/PaccMann/paccmann_gp@0.1.1
27
+ paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@0.0.1.1
28
+ paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
29
+ reinvent_models @ git+https://github.com/GT4SD/reinvent_models@v0.0.1
utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import defaultdict
3
+ from typing import List
4
+
5
+ import mols2grid
6
+ import pandas as pd
7
+
8
+ logger = logging.getLogger(__name__)
9
+ logger.addHandler(logging.NullHandler())
10
+
11
+
12
def draw_grid_generate(
    samples: List[str],
    seeds: List[str] = [],
    n_cols: int = 3,
    size=(140, 200),
) -> str:
    """
    Uses mols2grid to draw a HTML grid for the seed and generated molecules.

    Seeds are listed before the generated samples. Entries are named
    ``Seed_<i>`` and ``Generated_<i>`` respectively, and both the SMILES
    and the name are passed to mols2grid as tooltip fields.

    Args:
        samples: The generated samples (SMILES strings).
        seeds: Seed molecules (SMILES strings) shown ahead of the samples.
            Defaults to no seeds.
        n_cols: Number of columns in grid. Defaults to 3.
        size: Size of molecule in grid. Defaults to (140, 200).

    Returns:
        HTML to display
    """
    # NOTE: the default for `seeds` is a shared list object. It is only read
    # here (the concatenation below builds a new list), so it must never be
    # mutated in this function.
    result = {
        "SMILES": seeds + samples,
        "Name": [f"Seed_{i}" for i in range(len(seeds))]
        + [f"Generated_{i}" for i in range(len(samples))],
    }

    result_df = pd.DataFrame(result)
    obj = mols2grid.display(
        result_df,
        # Show every column of the frame in the tooltip.
        tooltip=list(result),
        height=1100,
        n_cols=n_cols,
        name="Results",
        size=size,
    )
    return obj.data