paccmann

Running

App Files Files Community

jannisborn committed on Jan 31, 2023

Commit

8e66b23

0 Parent(s):

Duplicate from GT4SD/paccmann_gp

Browse files

Files changed (10) hide show

.gitattributes +34 -0
.gitignore +1 -0
LICENSE +21 -0
README.md +15 -0
app.py +164 -0
model_cards/article.md +89 -0
model_cards/description.md +6 -0
model_cards/examples.csv +3 -0
requirements.txt +30 -0
utils.py +76 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+---
+title: PaccMann^GP
+emoji: 💡
+colorFrom: green
+colorTo: blue
+sdk: gradio
+sdk_version: 3.9.1
+app_file: app.py
+pinned: false
+python_version: 3.8.13
+pypi_version: 20.2.4
+duplicated_from: GT4SD/paccmann_gp
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import logging
+import pathlib
+from typing import List
+import gradio as gr
+import pandas as pd
+from gt4sd.algorithms.controlled_sampling.paccmann_gp import (
+    PaccMannGPGenerator,
+    PaccMannGP,
+)
+from gt4sd.algorithms.controlled_sampling.paccmann_gp.implementation import (
+    MINIMIZATION_FUNCTIONS,
+)
+from gt4sd.algorithms.registry import ApplicationsRegistry
+from utils import draw_grid_generate
+# Module logger with a no-op handler attached.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+# Remove the "callable" and "molwt" entries so they are not listed among the
+# selectable property goals built from MINIMIZATION_FUNCTIONS below.
+MINIMIZATION_FUNCTIONS.pop("callable", None)
+MINIMIZATION_FUNCTIONS.pop("molwt", None)
+def run_inference(
+    algorithm_version: str,
+    targets: List[str],
+    protein_target: str,
+    temperature: float,
+    length: float,
+    number_of_samples: int,
+    limit: int,
+    number_of_steps: int,
+    number_of_initial_points: int,
+    number_of_optimization_rounds: int,
+    sampling_variance: float,
+    samples_for_evaluation: int,
+    maximum_number_of_sampling_steps: int,
+    seed: int,
+):
+    config = PaccMannGPGenerator(
+        algorithm_version=algorithm_version.split("_")[-1],
+        batch_size=32,
+        temperature=temperature,
+        generated_length=length,
+        limit=limit,
+        acquisition_function="EI",
+        number_of_steps=number_of_steps,
+        number_of_initial_points=number_of_initial_points,
+        initial_point_generator="random",
+        number_of_optimization_rounds=number_of_optimization_rounds,
+        sampling_variance=sampling_variance,
+        samples_for_evaluation=samples_for_evaluation,
+        maximum_number_of_sampling_steps=maximum_number_of_sampling_steps,
+        seed=seed,
+    )
+    target = {i: {} for i in targets}
+    if "affinity" in targets:
+        if protein_target == "" or not isinstance(protein_target, str):
+            raise ValueError(
+                f"Protein target must be specified for affinity prediction, not ={protein_target}"
+            )
+        target["affinity"]["protein"] = protein_target
+    else:
+        protein_target = ""
+    model = PaccMannGP(config, target=target)
+    samples = list(model.sample(number_of_samples))
+    return draw_grid_generate(
+        samples=samples,
+        n_cols=5,
+        properties=set(target.keys()),
+        protein_target=protein_target,
+    )
+if __name__ == "__main__":
+    # Preparation (retrieve all available algorithms)
+    all_algos = ApplicationsRegistry.list_available()
+    algos = [
+        x["algorithm_version"]
+        for x in list(filter(lambda x: "PaccMannGP" in x["algorithm_name"], all_algos))
+    ]
+    # Load metadata
+    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
+    examples = pd.read_csv(
+        metadata_root.joinpath("examples.csv"), header=None, sep="|"
+    ).fillna("")
+    examples[1] = examples[1].apply(eval)
+    with open(metadata_root.joinpath("article.md"), "r") as f:
+        article = f.read()
+    with open(metadata_root.joinpath("description.md"), "r") as f:
+        description = f.read()
+    demo = gr.Interface(
+        fn=run_inference,
+        title="PaccMannGP",
+        inputs=[
+            gr.Dropdown(algos, label="Algorithm version", value="v0"),
+            gr.CheckboxGroup(
+                choices=list(MINIMIZATION_FUNCTIONS.keys()),
+                value=["qed"],
+                multiselect=True,
+                label="Property goals",
+            ),
+            gr.Textbox(
+                label="Protein target",
+                placeholder="MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTT",
+                lines=1,
+            ),
+            gr.Slider(minimum=0.5, maximum=2, value=1, label="Decoding temperature"),
+            gr.Slider(
+                minimum=5,
+                maximum=400,
+                value=100,
+                label="Maximal sequence length",
+                step=1,
+            ),
+            gr.Slider(
+                minimum=1, maximum=50, value=10, label="Number of samples", step=1
+            ),
+            gr.Slider(minimum=1, maximum=8, value=4.0, label="Limit"),
+            gr.Slider(minimum=1, maximum=32, value=8, label="Number of steps", step=1),
+            gr.Slider(
+                minimum=1, maximum=32, value=4, label="Number of initial points", step=1
+            ),
+            gr.Slider(
+                minimum=1,
+                maximum=4,
+                value=1,
+                label="Number of optimization rounds",
+                step=1,
+            ),
+            gr.Slider(minimum=0.01, maximum=1, value=0.1, label="Sampling variance"),
+            gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=1,
+                label="Samples used for evaluation",
+                step=1,
+            ),
+            gr.Slider(
+                minimum=1,
+                maximum=64,
+                value=4,
+                label="Maximum number of sampling steps",
+                step=1,
+            ),
+            gr.Number(value=42, label="Seed", precision=0),
+        ],
+        outputs=gr.HTML(label="Output"),
+        article=article,
+        description=description,
+        examples=examples.values.tolist(),
+    )
+    demo.launch(debug=True, show_error=True)

model_cards/article.md ADDED Viewed

	@@ -0,0 +1,89 @@

+# Model documentation & parameters
+**Algorithm Version**: Which model version to use.
+**Property goals**: One or multiple properties that will be optimized.
+**Protein target**: The amino acid sequence (AAS) of the protein target used for conditioning. Leave blank unless `affinity` is one of the selected `Property goals`.
+**Decoding temperature**: The temperature parameter in the SMILES/SELFIES decoder. Higher values lead to more explorative choices, smaller values culminate in mode collapse.
+**Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
+**Number of samples**: How many samples should be generated (between 1 and 50).
+**Limit**: Hypercube limits in the latent space.
+**Number of steps**: Number of steps for a GP optimization round. More steps make the run slower. Has to be at least `Number of initial points`.
+**Number of initial points**: Number of initial points evaluated. More points make the run slower.
+**Number of optimization rounds**: Maximum number of optimization rounds.
+**Sampling variance**: Variance of the Gaussian noise applied during sampling from the optimal point.
+**Samples for evaluation**: Number of samples averaged for each minimization function evaluation.
+**Max. sampling steps**: Maximum number of sampling steps in an optimization round.
+**Seed**: The random seed used for initialization.
+# Model card -- PaccMannGP
+**Model Details**: [PaccMann<sup>GP</sup>](https://github.com/PaccMann/paccmann_gp) is a language-based Variational Autoencoder that is coupled with a Gaussian Process for controlled sampling. This model systematically explores the latent space of a trained molecular VAE.
+**Developers**: Jannis Born, Matteo Manica and colleagues from IBM Research.
+**Distributors**: Original authors' code wrapped and distributed by GT4SD Team (2023) from IBM Research.
+**Model date**: Published in 2022.
+**Model version**: A molecular VAE trained on 1.5M molecules from ChEMBL.
+**Model type**: A language-based molecular generative model that can be explored with Gaussian Processes to generate molecules with desired properties.
+**Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
+Described in the [original paper](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
+**Paper or other resource for more information**:
+[Active Site Sequence Representations of Human Kinases Outperform Full Sequence Representations for Affinity Prediction and Inhibitor Generation: 3D Effects in a 1D Model (2022; *Journal of Chemical Information & Modeling*)](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
+**License**: MIT
+**Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
+**Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
+**Primary intended uses/users**: Researchers and computational chemists using the model for model comparison or research exploration purposes.
+**Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
+**Factors**: Not applicable.
+**Metrics**: High reward on generating molecules with desired properties.
+**Datasets**: ChEMBL.
+**Ethical Considerations**: Unclear, please consult with original authors in case of questions.
+**Caveats and Recommendations**: Unclear, please consult with original authors in case of questions.
+Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
+## Citation
+```bib
+@article{born2022active,
+	author = {Born, Jannis and Huynh, Tien and Stroobants, Astrid and Cornell, Wendy D. and Manica, Matteo},
+	title = {Active Site Sequence Representations of Human Kinases Outperform Full Sequence Representations for Affinity Prediction and Inhibitor Generation: 3D Effects in a 1D Model},
+	journal = {Journal of Chemical Information and Modeling},
+	volume = {62},
+	number = {2},
+	pages = {240-257},
+	year = {2022},
+	doi = {10.1021/acs.jcim.1c00889},
+	note ={PMID: 34905358},
+	URL = {https://doi.org/10.1021/acs.jcim.1c00889}
+}
+```

model_cards/description.md ADDED Viewed

	@@ -0,0 +1,6 @@

+<img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
+[PaccMann<sup>GP</sup>](https://github.com/PaccMann/paccmann_gp) is a language-based Variational Autoencoder that is coupled with a Gaussian Process for controlled sampling. For details of the methodology, please see [Born et al. (2022), *Journal of Chemical Information & Modeling*](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
+For **examples** and **documentation** of the model parameters, please see below.
+Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.

model_cards/examples.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+v0|["qed"]||1.2|100|10|4|8|4|1|0.1|3|4|42
+v0|["qed","sa"]||1.2|100|10|4|8|4|1|0.1|3|4|42
+v0|["affinity"]|MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTT|1.2|100|10|4|8|4|1|0.1|3|4|42

requirements.txt ADDED Viewed

	@@ -0,0 +1,30 @@

+-f https://download.pytorch.org/whl/cpu/torch_stable.html
+-f https://data.pyg.org/whl/torch-1.12.1+cpu.html
+# pip==20.2.4
+torch==1.12.1
+torch-scatter
+torch-spline-conv
+torch-sparse
+torch-geometric
+torchvision==0.13.1
+torchaudio==0.12.1
+gt4sd>=1.1.6
+diffusers==0.6.0
+molgx>=0.22.0a1
+molecule_generation
+nglview
+PyTDC==0.3.7
+gradio==3.12.0
+markdown-it-py>=2.1.0
+mols2grid>=0.2.0
+numpy==1.23.5
+pandas>=1.0.0
+terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
+guacamol_baselines @ git+https://github.com/GT4SD/guacamol_baselines.git@v0.0.2
+moses @ git+https://github.com/GT4SD/moses.git@v0.1.0
+paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@0.0.4
+paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@0.0.2
+paccmann_gp @ git+https://github.com/PaccMann/paccmann_gp@0.1.1
+paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@0.0.1.1
+paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
+reinvent_models @ git+https://github.com/GT4SD/reinvent_models@v0.0.1

utils.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import logging
+from collections import defaultdict
+from typing import List, Callable
+from gt4sd.properties import PropertyPredictorRegistry
+from gt4sd.algorithms.prediction.paccmann.core import PaccMann, AffinityPredictor
+import torch
+import mols2grid
+import pandas as pd
+# Module logger with a no-op handler attached.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+def get_affinity_function(target: str) -> Callable:
+    return lambda mols: torch.stack(
+        list(
+            PaccMann(
+                AffinityPredictor(protein_targets=[target] * len(mols), ligands=mols)
+            ).sample(len(mols))
+        )
+    ).tolist()
+EVAL_DICT = {
+    "qed": PropertyPredictorRegistry.get_property_predictor("qed"),
+    "sa": PropertyPredictorRegistry.get_property_predictor("sas"),
+}
+def draw_grid_generate(
+    samples: List[str],
+    properties: List[str],
+    protein_target: str,
+    n_cols: int = 3,
+    size=(140, 200),
+) -> str:
+    """
+    Uses mols2grid to draw a HTML grid for the generated molecules
+    Args:
+        samples: The generated samples.
+        n_cols: Number of columns in grid. Defaults to 5.
+        size: Size of molecule in grid. Defaults to (140, 200).
+    Returns:
+        HTML to display
+    """
+    if protein_target != "":
+        EVAL_DICT.update({"affinity": get_affinity_function(protein_target)})
+    result = defaultdict(list)
+    result.update(
+        {"SMILES": samples, "Name": [f"Generated_{i}" for i in range(len(samples))]},
+    )
+    if "affinity" in properties:
+        properties.remove("affinity")
+        vals = EVAL_DICT["affinity"](samples)
+        result["affinity"] = vals
+    # Fill properties
+    for sample in samples:
+        for prop in properties:
+            value = EVAL_DICT[prop](sample)
+            result[prop].append(f"{prop} = {value}")
+    result_df = pd.DataFrame(result)
+    obj = mols2grid.display(
+        result_df,
+        tooltip=list(result.keys()),
+        height=1100,
+        n_cols=n_cols,
+        name="Results",
+        size=size,
+    )
+    return obj.data