jannisborn commited on
Commit
895a807
1 Parent(s): 2543308

wip: initial version

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
- title: Gt4sd Moler
3
- emoji: 👁
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 3.16.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: MoLeR
3
+ emoji: 💡
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 3.9.1
8
  app_file: app.py
9
  pinned: false
10
+ python_version: 3.8.13
11
+ pypi_version: 20.2.4
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pathlib
3
+
4
+ import gradio as gr
5
+ import pandas as pd
6
+ from gt4sd.algorithms.generation.moler import MoLeR, MoLeRDefaultGenerator
7
+
8
+ from gt4sd.algorithms.registry import ApplicationsRegistry
9
+ from utils import draw_grid_generate
10
+
11
+ logger = logging.getLogger(__name__)
12
+ logger.addHandler(logging.NullHandler())
13
+
14
+ TITLE = "MoLeR"
15
+
16
+
17
def run_inference(
    algorithm_version: str,
    scaffolds: str,
    beam_size: int,
    number_of_samples: int,
    seed: int,
):
    """
    Sample molecules from MoLeR and render them as a grid.

    Args:
        algorithm_version: Version of the MoLeR algorithm to use.
        scaffolds: Scaffold input forwarded to the generator configuration.
        beam_size: Beam size forwarded to the generator configuration.
        number_of_samples: How many samples to draw from the model.
        seed: Seed forwarded to the generator configuration.

    Returns:
        The value produced by `draw_grid_generate` for the drawn samples,
        which the interface shows in its output component.
    """
    config = MoLeRDefaultGenerator(
        algorithm_version=algorithm_version,
        scaffolds=scaffolds,
        beam_size=beam_size,
        # NOTE(review): fixed value that is independent of `number_of_samples`;
        # presumably a per-call batch size -- confirm against the GT4SD docs.
        num_samples=4,
        seed=seed,
    )
    model = MoLeR(configuration=config)
    samples = list(model.sample(number_of_samples))

    # The rendered grid has to be returned: the interface displays the
    # function's return value, and discarding it leaves the output empty.
    return draw_grid_generate(samples)
35
+
36
+
37
if __name__ == "__main__":

    # Versions of every registered algorithm whose name contains TITLE
    algos = [
        entry["algorithm_version"]
        for entry in ApplicationsRegistry.list_available()
        if TITLE in entry["algorithm_name"]
    ]

    # Static assets for the interface live in `model_cards/` next to this file
    metadata_root = pathlib.Path(__file__).parent / "model_cards"

    # Example rows for the interface; missing cells become empty strings
    examples = pd.read_csv(metadata_root / "examples.csv", header=None).fillna("")

    article = (metadata_root / "article.md").read_text()
    description = (metadata_root / "description.md").read_text()

    # Input widgets, in the same order as the parameters of `run_inference`
    input_components = [
        gr.Dropdown(algos, label="Algorithm version", value="v0"),
        gr.Textbox(
            label="Scaffolds",
            placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
            lines=1,
        ),
        gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Beam_size"),
        gr.Slider(minimum=1, maximum=50, value=10, label="Number of samples", step=1),
        gr.Number(value=42, label="Seed", precision=0),
    ]

    demo = gr.Interface(
        fn=run_inference,
        title=TITLE,
        inputs=input_components,
        outputs=gr.HTML(label="Output"),
        article=article,
        description=description,
        examples=examples.values.tolist(),
    )
    demo.launch(debug=True, show_error=True)
model_cards/article.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model card -- MoLeR
2
+
3
+ ## Parameters
4
+
5
+ ### Algorithm Version:
6
+ Which model checkpoint to use (trained on different datasets).
7
+
8
+ ### Number of samples
9
+ How many samples should be generated (between 1 and 50).
10
+
11
+
12
+ ## Citation
13
+
14
+ ```bib
15
+ @inproceedings{maziarz2021learning,
16
+ author={Krzysztof Maziarz and Henry Richard Jackson{-}Flux and Pashmina Cameron and
17
+ Finton Sirockin and Nadine Schneider and Nikolaus Stiefl and Marwin H. S. Segler and Marc Brockschmidt},
18
+ title = {Learning to Extend Molecular Scaffolds with Structural Motifs},
19
+ booktitle = {The Tenth International Conference on Learning Representations, {ICLR}},
20
+ year = {2022}
21
+ }
22
+ ```
23
+
model_cards/description.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+
2
+ ### Learning to Extend Molecular Scaffolds with Structural Motifs
3
+
4
+ MoLeR is a graph-based generative model that builds molecules by extending a given scaffold with structural motifs or single atoms.
5
+ For details see the [arXiv preprint](https://arxiv.org/abs/2103.03864), the [development code](https://github.com/microsoft/molecule-generation) and the [GT4SD endpoint](https://github.com/GT4SD/gt4sd-core) for inference.
6
+
7
+ Each `algorithm_version` refers to one trained model. Optionally provide a scaffold as a SMILES string; the generated molecules are displayed in a grid.
model_cards/examples.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ v0,,1,4,0
2
+ v0,CC(=O)NC1=NC2=CC(OCC3=CC=CN(CC4=CC=C(Cl)C=C4)C3=O)=CC=C2N1,1,10,0
3
+ v0,C12C=CC=NN1C(C#CC1=C(C)C=CC3C(NC4=CC(C(F)(F)F)=CC=C4)=NOC1=3)=CN=2,3,5,5
4
+
5
+
requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -f https://download.pytorch.org/whl/cpu/torch_stable.html
2
+ -f https://data.pyg.org/whl/torch-1.12.1+cpu.html
3
+ # pip==20.2.4
4
+ torch==1.12.1
5
+ torch-scatter
6
+ torch-spline-conv
7
+ torch-sparse
8
+ torch-geometric
9
+ torchvision==0.13.1
10
+ torchaudio==0.12.1
11
+ gt4sd>=1.0.0
12
+ molgx>=0.22.0a1
13
+ molecule_generation
14
+ nglview
15
+ PyTDC==0.3.7
16
+ gradio==3.12.0
17
+ markdown-it-py>=2.1.0
18
+ mols2grid>=0.2.0
19
+ pandas>=1.0.0
20
+ terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
21
+ guacamol_baselines @ git+https://github.com/GT4SD/guacamol_baselines.git@v0.0.2
22
+ moses @ git+https://github.com/GT4SD/moses.git@v0.1.0
23
+ paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@0.0.4
24
+ paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@0.0.2
25
+ paccmann_gp @ git+https://github.com/PaccMann/paccmann_gp@0.1.1
26
+ paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@0.0.1.1
27
+ paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
28
+ reinvent_models @ git+https://github.com/GT4SD/reinvent_models@v0.0.1
utils.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ from collections import defaultdict
5
+ from typing import Dict, List, Tuple
6
+
7
+ import mols2grid
8
+ import pandas as pd
9
+ from rdkit import Chem
10
+ from terminator.selfies import decoder
11
+
12
+ logger = logging.getLogger(__name__)
13
+ logger.addHandler(logging.NullHandler())
14
+
15
+
16
def draw_grid_generate(
    samples: List[Tuple[str]],
    domain: str = "molecules",
    n_cols: int = 5,
    size=(140, 200),
) -> str:
    """
    Uses mols2grid to draw a HTML grid for the generated molecules

    Args:
        samples: The generated samples. They are placed in the "SMILES"
            column of the data frame handed to mols2grid.
        domain: Not used by this function; kept so existing callers that
            pass it keep working. Defaults to "molecules".
        n_cols: Number of columns in grid. Defaults to 5.
        size: Size of molecule in grid. Defaults to (140, 200).

    Returns:
        HTML to display (the `data` attribute of the object returned by
        `mols2grid.display`).
    """

    # One row per sample, named by its position in `samples`
    result = {
        "SMILES": samples,
        "Name": [f"sample_{i}" for i in range(len(samples))],
    }

    result_df = pd.DataFrame(result)
    # Debug output goes through the module logger instead of stdout
    logger.debug("Generated samples:\n%s", result_df)

    obj = mols2grid.display(
        result_df,
        tooltip=list(result),
        height=1100,
        n_cols=n_cols,
        name="Results",
        size=size,
    )
    return obj.data