jannisborn committed on
Commit
1634315
1 Parent(s): 39bc9b9
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: GT4SD - MoLeR
3
  emoji: 💡
4
  colorFrom: green
5
  colorTo: blue
@@ -9,7 +9,7 @@ app_file: app.py
9
  pinned: false
10
  python_version: 3.8.13
11
  pypi_version: 20.2.4
12
- duplicated_from: jannisborn/gt4sd-moler
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: GT4SD - Polymer Blocks
3
  emoji: 💡
4
  colorFrom: green
5
  colorTo: blue
 
9
  pinned: false
10
  python_version: 3.8.13
11
  pypi_version: 20.2.4
12
+ duplicated_from: jannisborn/gt4sd-paccmann-gp
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,39 +1,31 @@
1
  import logging
2
  import pathlib
3
-
4
  import gradio as gr
5
  import pandas as pd
6
- from gt4sd.algorithms.generation.moler import MoLeR, MoLeRDefaultGenerator
 
 
 
7
 
8
  from gt4sd.algorithms.registry import ApplicationsRegistry
 
9
  from utils import draw_grid_generate
10
 
11
  logger = logging.getLogger(__name__)
12
  logger.addHandler(logging.NullHandler())
13
 
14
- TITLE = "MoLeR"
15
 
 
16
 
17
- def run_inference(
18
- algorithm_version: str,
19
- scaffolds: str,
20
- beam_size: int,
21
- number_of_samples: int,
22
- seed: int,
23
- ):
24
- config = MoLeRDefaultGenerator(
25
  algorithm_version=algorithm_version,
26
- scaffolds=scaffolds,
27
- beam_size=beam_size,
28
- num_samples=4,
29
- seed=seed,
30
- num_workers=1,
31
  )
32
- model = MoLeR(configuration=config)
33
  samples = list(model.sample(number_of_samples))
34
 
35
- seed_mols = [] if scaffolds == "" else scaffolds.split(".")
36
- return draw_grid_generate(seed_mols, samples)
37
 
38
 
39
  if __name__ == "__main__":
@@ -42,7 +34,9 @@ if __name__ == "__main__":
42
  all_algos = ApplicationsRegistry.list_available()
43
  algos = [
44
  x["algorithm_version"]
45
- for x in list(filter(lambda x: TITLE in x["algorithm_name"], all_algos))
 
 
46
  ]
47
 
48
  # Load metadata
@@ -59,19 +53,19 @@ if __name__ == "__main__":
59
 
60
  demo = gr.Interface(
61
  fn=run_inference,
62
- title="MoLeR (MOlecule-LEvel Representation)",
63
  inputs=[
64
  gr.Dropdown(algos, label="Algorithm version", value="v0"),
65
- gr.Textbox(
66
- label="Scaffolds",
67
- placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
68
- lines=1,
 
 
69
  ),
70
- gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Beam_size"),
71
  gr.Slider(
72
  minimum=1, maximum=50, value=10, label="Number of samples", step=1
73
  ),
74
- gr.Number(value=42, label="Seed", precision=0),
75
  ],
76
  outputs=gr.HTML(label="Output"),
77
  article=article,
 
1
  import logging
2
  import pathlib
 
3
  import gradio as gr
4
  import pandas as pd
5
+ from gt4sd.algorithms.generation.polymer_blocks import (
6
+ PolymerBlocksGenerator,
7
+ PolymerBlocks,
8
+ )
9
 
10
  from gt4sd.algorithms.registry import ApplicationsRegistry
11
+
12
  from utils import draw_grid_generate
13
 
14
  logger = logging.getLogger(__name__)
15
  logger.addHandler(logging.NullHandler())
16
 
 
17
 
18
+ def run_inference(algorithm_version: str, length: float, number_of_samples: int):
19
 
20
+ config = PolymerBlocksGenerator(
 
 
 
 
 
 
 
21
  algorithm_version=algorithm_version,
22
+ batch_size=32,
23
+ generated_length=length,
 
 
 
24
  )
25
+ model = PolymerBlocks(config)
26
  samples = list(model.sample(number_of_samples))
27
 
28
+ return draw_grid_generate(samples=samples, n_cols=5, seeds=[])
 
29
 
30
 
31
  if __name__ == "__main__":
 
34
  all_algos = ApplicationsRegistry.list_available()
35
  algos = [
36
  x["algorithm_version"]
37
+ for x in list(
38
+ filter(lambda x: "PolymerBlocks" in x["algorithm_name"], all_algos)
39
+ )
40
  ]
41
 
42
  # Load metadata
 
53
 
54
  demo = gr.Interface(
55
  fn=run_inference,
56
+ title="Polymer Blocks",
57
  inputs=[
58
  gr.Dropdown(algos, label="Algorithm version", value="v0"),
59
+ gr.Slider(
60
+ minimum=5,
61
+ maximum=400,
62
+ value=100,
63
+ label="Maximal sequence length",
64
+ step=1,
65
  ),
 
66
  gr.Slider(
67
  minimum=1, maximum=50, value=10, label="Number of samples", step=1
68
  ),
 
69
  ],
70
  outputs=gr.HTML(label="Output"),
71
  article=article,
model_cards/article.md CHANGED
@@ -1,37 +1,36 @@
1
  # Model documentation & parameters
2
 
3
- **Algorithm Version**: Which model checkpoint to use (trained on different datasets).
4
 
5
- **Scaffolds**: One or multiple scaffolds (or seed molecules), provided as '.'-separated SMILES. If empty, no scaffolds are used.
6
 
7
  **Number of samples**: How many samples should be generated (between 1 and 50).
8
 
9
- **Beam size**: Beam size used in beam search decoding (the higher the slower but better).
10
 
11
- **Seed**: The random seed used for initialization.
12
 
 
13
 
14
- # Model card
15
 
16
- **Model Details**: MoLeR is a graph-based molecular generative model that can be conditioned (primed) on scaffolds. The model decorates scaffolds with realistic structural motifs.
17
 
18
- **Developers**: Krzysztof Maziarz and co-authors from Microsoft Research and Novartis (full reference at bottom).
19
 
20
- **Distributors**: Developer's code wrapped and distributed by GT4SD Team (2023) from IBM Research.
21
 
22
- **Model date**: Released around March 2022.
23
 
24
- **Model version**: Model provided by original authors, see [their GitHub repo](https://github.com/microsoft/molecule-generation).
25
 
26
- **Model type**: An encoder-decoder-based GNN for molecular generation.
 
27
 
28
- **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**: Trained by the original authors with the default parameters provided [on GitHub](https://github.com/microsoft/molecule-generation).
29
-
30
- **Paper or other resource for more information**: [Learning to Extend Molecular Scaffolds with Structural Motifs (ICLR 2022)](https://openreview.net/forum?id=ZTsoE8G3GG).
31
 
32
  **License**: MIT
33
 
34
- **Where to send questions or comments about the model**: Open an issue on original author's [GitHub repository](https://github.com/microsoft/molecule-generation).
35
 
36
  **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
37
 
@@ -39,11 +38,9 @@
39
 
40
  **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
41
 
42
- **Factors**: Not applicable.
43
-
44
- **Metrics**: Validation loss on decoding correct molecules. Evaluated on several downstream tasks.
45
 
46
- **Datasets**: 1.5M drug-like molecules from GuacaMol benchmark. Finetuning on 20 molecular optimization tasks from GuacaMol.
47
 
48
  **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
49
 
@@ -52,14 +49,12 @@
52
  Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
53
 
54
  ## Citation
55
-
56
  ```bib
57
- @inproceedings{maziarz2021learning,
58
- author={Krzysztof Maziarz and Henry Richard Jackson{-}Flux and Pashmina Cameron and
59
- Finton Sirockin and Nadine Schneider and Nikolaus Stiefl and Marwin H. S. Segler and Marc Brockschmidt},
60
- title = {Learning to Extend Molecular Scaffolds with Structural Motifs},
61
- booktitle = {The Tenth International Conference on Learning Representations, {ICLR}},
62
- year = {2022}
63
  }
64
- ```
65
-
 
1
  # Model documentation & parameters
2
 
3
+ **Algorithm Version**: Which model version to use.
4
 
5
+ **Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
6
 
7
  **Number of samples**: How many samples should be generated (between 1 and 50).
8
 
 
9
 
 
10
 
11
+ # Model card -- PolymerBlocks
12
 
13
+ **Model Details**: *PolymerBlocks* is a sequence-based molecular generator tuned to generate blocks of polymers (e.g., catalysts and monomers). The model relies on a Variational Autoencoder architecture as described in [Born et al. (2021; *iScience*)](https://www.sciencedirect.com/science/article/pii/S2589004221002376).
14
 
15
+ **Developers**: Matteo Manica and colleagues from IBM Research.
16
 
17
+ **Distributors**: Original authors' code integrated into GT4SD.
18
 
19
+ **Model date**: Not yet published.
20
 
21
+ **Model version**: Only initial model version.
22
 
23
+ **Model type**: A sequence-based molecular generator tuned to generate blocks of polymers (e.g., catalysts and monomers).
24
 
25
+ **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
26
+ N.A.
27
 
28
+ **Paper or other resource for more information**:
29
+ TBD
 
30
 
31
  **License**: MIT
32
 
33
+ **Where to send questions or comments about the model**: Open an issue on the [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
34
 
35
  **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
36
 
 
38
 
39
  **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
40
 
41
+ **Metrics**: N.A.
 
 
42
 
43
+ **Datasets**: N.A.
44
 
45
  **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
46
 
 
49
  Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
50
 
51
  ## Citation
52
+ TBD; in the meantime, please cite:
53
  ```bib
54
+ @article{manica2022gt4sd,
55
+ title={GT4SD: Generative Toolkit for Scientific Discovery},
56
+ author={Manica, Matteo and Cadow, Joris and Christofidellis, Dimitrios and Dave, Ashish and Born, Jannis and Clarke, Dean and Teukam, Yves Gaetan Nana and Hoffman, Samuel C and Buchan, Matthew and Chenthamarakshan, Vijil and others},
57
+ journal={arXiv preprint arXiv:2207.03928},
58
+ year={2022}
 
59
  }
60
+ ```
 
model_cards/description.md CHANGED
@@ -1,6 +1,6 @@
1
  <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
 
3
- MoLeR ([Maziarz et al., (2022), *ICLR*](https://openreview.net/forum?id=ZTsoE8G3GG)) is a graph-based molecular generative model that can be conditioned (primed) on scaffolds. This model is provided and distributed by the **GT4SD** (Generative Toolkit for Scientific Discovery).
4
 
5
  For **examples** and **documentation** of the model parameters, please see below.
6
  Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
 
1
  <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
 
3
+ *PolymerBlocks* is a sequence-based molecular generator tuned to generate blocks of polymers (e.g., catalysts and monomers). The model relies on a Variational Autoencoder architecture as described in [Born et al. (2021; *iScience*)](https://www.sciencedirect.com/science/article/pii/S2589004221002376).
4
 
5
  For **examples** and **documentation** of the model parameters, please see below.
6
  Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv CHANGED
@@ -1,5 +1 @@
1
- v0,,1,4,0
2
- v0,CC(=O)NC1=NC2=CC(OCC3=CC=CN(CC4=CC=C(Cl)C=C4)C3=O)=CC=C2N1,1,10,0
3
- v0,C12C=CC=NN1C(C#CC1=C(C)C=CC3C(NC4=CC(C(F)(F)F)=CC=C4)=NOC1=3)=CN=2.CCO,3,5,5
4
-
5
-
 
1
+ v0,100,10
 
 
 
 
requirements.txt CHANGED
@@ -8,7 +8,7 @@ torch-sparse
8
  torch-geometric
9
  torchvision==0.13.1
10
  torchaudio==0.12.1
11
- gt4sd>=1.0.0
12
  molgx>=0.22.0a1
13
  molecule_generation
14
  nglview
 
8
  torch-geometric
9
  torchvision==0.13.1
10
  torchaudio==0.12.1
11
+ gt4sd>=1.0.5
12
  molgx>=0.22.0a1
13
  molecule_generation
14
  nglview
utils.py CHANGED
@@ -1,21 +1,17 @@
1
- import json
2
  import logging
3
- import os
4
  from collections import defaultdict
5
- from typing import Dict, List, Tuple
6
 
7
  import mols2grid
8
  import pandas as pd
9
- from rdkit import Chem
10
- from terminator.selfies import decoder
11
 
12
  logger = logging.getLogger(__name__)
13
  logger.addHandler(logging.NullHandler())
14
 
15
 
16
  def draw_grid_generate(
17
- seeds: List[str],
18
  samples: List[str],
 
19
  n_cols: int = 3,
20
  size=(140, 200),
21
  ) -> str:
 
 
1
  import logging
 
2
  from collections import defaultdict
3
+ from typing import List
4
 
5
  import mols2grid
6
  import pandas as pd
 
 
7
 
8
  logger = logging.getLogger(__name__)
9
  logger.addHandler(logging.NullHandler())
10
 
11
 
12
  def draw_grid_generate(
 
13
  samples: List[str],
14
+ seeds: List[str] = [],
15
  n_cols: int = 3,
16
  size=(140, 200),
17
  ) -> str: