Spaces:

GT4SD
/

regression_transformer

Running

App Files Files Community

jannisborn commited on Dec 2, 2022

Commit

8b150bd

1 Parent(s): 3a8c428

feat: Initial RT app

Browse files

Files changed (10) hide show

LICENSE +21 -0
README.md +13 -11
app.py +158 -0
model_cards/.DS_Store +0 -0
model_cards/regression_transformer.png +0 -0
model_cards/regression_transformer_article.md +59 -0
model_cards/regression_transformer_description.md +8 -0
model_cards/regression_transformer_examples.csv +7 -0
requirements.txt +5 -0
utils.py +172 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,12 +1,14 @@
----
-title: Regression Transformer
-emoji: 😻
-colorFrom: indigo
-colorTo: red
-sdk: gradio
-sdk_version: 3.12.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# gt4sd-apps
+Web apps of GT4SD models powered via gradio.
+## Installation
+1. Install `gt4sd` from [`gt4sd-core`](https://github.com/GT4SD/gt4sd-core).
+2. Install requirements in env:
+```sh
+conda activate gt4sd
+pip install -r requirements.txt
+```
+3. Run a demo on a localhost:
+```sh
+python apps/algorithms/conditional_generation/regression_transformer/app.py
+```

app.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import logging
+import pathlib
+import gradio as gr
+import pandas as pd
+from gt4sd.algorithms.conditional_generation.regression_transformer import (
+    RegressionTransformer,
+)
+from gt4sd.algorithms.registry import ApplicationsRegistry
+from utils import (
+    draw_grid_generate,
+    draw_grid_predict,
+    get_application,
+    get_inference_dict,
+    get_rt_name,
+)
+# Module-level logger; a NullHandler is attached so this module does not configure log output itself.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+def regression_transformer(
+    algorithm: str,
+    task: str,
+    target: str,
+    number_of_samples: int,
+    search: str,
+    temperature: float,
+    tolerance: int,
+    wrapper: bool,
+    fraction_to_mask: float,
+    property_goal: str,
+    tokens_to_mask: str,
+    substructures_to_mask: str,
+    substructures_to_keep: str,
+):
+    if task == "Predict" and wrapper:
+        logger.warning(
+            f"For prediction, no sampling_wrapper will be used, ignoring: fraction_to_mask: {fraction_to_mask}, "
+            f"tokens_to_mask: {tokens_to_mask}, substructures_to_mask={substructures_to_mask}, "
+            f"substructures_to_keep: {substructures_to_keep}."
+        )
+        sampling_wrapper = {}
+    elif not wrapper:
+        sampling_wrapper = {}
+    else:
+        substructures_to_mask = (
+            []
+            if substructures_to_mask == ""
+            else substructures_to_mask.replace(" ", "").split(",")
+        )
+        substructures_to_keep = (
+            []
+            if substructures_to_keep == ""
+            else substructures_to_keep.replace(" ", "").split(",")
+        )
+        tokens_to_mask = [] if tokens_to_mask == "" else tokens_to_mask.split(",")
+        property_goals = {}
+        if property_goal == "":
+            raise ValueError(
+                "For conditional generation you have to specify `property_goal`."
+            )
+        for line in property_goal.split(","):
+            property_goals[line.split(":")[0].strip()] = float(line.split(":")[1])
+        sampling_wrapper = {
+            "substructures_to_keep": substructures_to_keep,
+            "substructures_to_mask": substructures_to_mask,
+            "text_filtering": False,
+            "fraction_to_mask": fraction_to_mask,
+            "property_goal": property_goals,
+        }
+    algorithm_application = get_application(algorithm.split(":")[0])
+    algorithm_version = algorithm.split(" ")[-1].lower()
+    config = algorithm_application(
+        algorithm_version=algorithm_version,
+        search=search.lower(),
+        temperature=temperature,
+        tolerance=tolerance,
+        sampling_wrapper=sampling_wrapper,
+    )
+    model = RegressionTransformer(configuration=config, target=target)
+    samples = list(model.sample(number_of_samples))
+    if task == "Predict":
+        return draw_grid_predict(samples[0], target, domain=algorithm.split(":")[0])
+    else:
+        return draw_grid_generate(samples, domain=algorithm.split(":")[0])
+if __name__ == "__main__":
+    # Preparation (retrieve all available algorithms)
+    all_algos = ApplicationsRegistry.list_available()
+    rt_algos = list(
+        filter(lambda x: "RegressionTransformer" in x["algorithm_name"], all_algos)
+    )
+    rt_names = list(map(get_rt_name, rt_algos))
+    properties = {}
+    for algo in rt_algos:
+        application = get_application(
+            algo["algorithm_application"].split("Transformer")[-1]
+        )
+        data = get_inference_dict(
+            application=application, algorithm_version=algo["algorithm_version"]
+        )
+        properties[get_rt_name(algo)] = data
+    properties
+    # Load metadata
+    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
+    examples = pd.read_csv(
+        metadata_root.joinpath("regression_transformer_examples.csv"), header=None
+    ).fillna("")
+    with open(metadata_root.joinpath("regression_transformer_article.md"), "r") as f:
+        article = f.read()
+    with open(
+        metadata_root.joinpath("regression_transformer_description.md"), "r"
+    ) as f:
+        description = f.read()
+    demo = gr.Interface(
+        fn=regression_transformer,
+        title="Regression Transformer",
+        inputs=[
+            gr.Dropdown(rt_names, label="Algorithm version", value="Molecules: Qed"),
+            gr.Radio(choices=["Predict", "Generate"], label="Task", value="Generate"),
+            gr.Textbox(
+                label="Input", placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1", lines=1
+            ),
+            gr.Slider(
+                minimum=1, maximum=50, value=10, label="Number of samples", step=1
+            ),
+            gr.Radio(choices=["Sample", "Greedy"], label="Search", value="Sample"),
+            gr.Slider(minimum=0.5, maximum=2, value=1, label="Decoding temperature"),
+            gr.Slider(minimum=5, maximum=100, value=30, label="Tolerance", step=1),
+            gr.Radio(choices=[True, False], label="Sampling Wrapper", value=True),
+            gr.Slider(minimum=0, maximum=1, value=0.5, label="Fraction to mask"),
+            gr.Textbox(label="Property goal", placeholder="<qed>:0.75", lines=1),
+            gr.Textbox(label="Tokens to mask", placeholder="N, C", lines=1),
+            gr.Textbox(
+                label="Substructures to mask", placeholder="C(=O), C#C", lines=1
+            ),
+            gr.Textbox(
+                label="Substructures to keep", placeholder="C1=CC=C(Cl)C=C1", lines=1
+            ),
+        ],
+        outputs=gr.HTML(label="Output"),
+        article=article,
+        description=description,
+        examples=examples.values.tolist(),
+    )
+    demo.launch(debug=True, show_error=True)

model_cards/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

model_cards/regression_transformer.png ADDED Viewed

model_cards/regression_transformer_article.md ADDED Viewed

	@@ -0,0 +1,59 @@

+# Model card -- Regression Transformer
+## Parameters
+### Algorithm Version:
+Which model checkpoint to use (trained on different datasets).
+### Task
+Whether the multitask model should be used for property prediction or conditional generation (default).
+### Input
+The input sequence. In the default setting (where `Task` is *Generate* and `Sampling Wrapper` is *True*) this can be a seed SMILES (for the molecule models) or amino-acid sequence (for the protein models). The model will locally adapt the seed sequence by masking `Fraction to mask` of the tokens.
+If the `Task` is *Predict*, the sequences are given as SELFIES for the molecule models. Moreover, the tokens that should be predicted (`[MASK]` in the input) have to be given explicitly. Populate the examples to understand better.
+NOTE: When setting `Task` to *Generate*, and `Sampling Wrapper` to *False*, the user has maximal control over the generative process and can explicitly decide which tokens should be masked.
+### Number of samples
+How many samples should be generated (between 1 and 50). If `Task` is *Predict*, this has to be set to 1.
+### Search
+Decoding search method. Use *Sample* if `Task` is *Generate*. If `Task` is *Predict*, use *Greedy*.
+### Tolerance
+Precision tolerance; only used if `Task` is *Generate*. This is a single float between 0 and 100 for the tolerated deviation between desired/primed property and predicted property of the generated molecule. Given in percentage with respect to the property range encountered during training.
+NOTE: The tolerance is *only* used for post-hoc filtering of the generated samples.
+### Sampling Wrapper
+Only used if `Task` is *Generate*. If set to *False*, the user has to provide a full RT-sequence as `Input` and has to **explicitly** decide which tokens are masked (see example below). This gives full control but is tedious. Instead, if `Sampling Wrapper` is set to *True*, the RT stochastically determines which parts of the sequence are masked.
+**NOTE**: All below arguments only apply if `Sampling Wrapper` is *True*.
+#### Fraction to mask
+Specifies the ratio of tokens that can be changed by the model. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
+#### Property goal
+Specifies the desired target properties for the generation. Need to be given in the format `<prop>:value`. If the model supports multiple properties, give them separated by a comma `,`. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
+#### Tokens to mask
+Optionally specifies which tokens (atoms, bonds etc) can be masked. Please separate multiple tokens by comma (`,`). If not specified, all tokens can be masked. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
+#### Substructures to mask
+Optionally specifies a list of substructures that should *definitely* be masked (excluded from stochastic masking). Given in SMILES format. If multiple are provided, separate by comma (`,`). Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
+*NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
+#### Substructures to keep
+Optionally specifies a list of substructures that should definitely be present in the target sample (i.e., excluded from stochastic masking). Given in SMILES format. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
+*NOTE*: This keeps tokens even if they are included in `tokens_to_mask`.
+*NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
+## Citation
+```bib
+@article{born2022regression,
+  title={Regression Transformer: Concurrent Conditional Generation and Regression by Blending Numerical and Textual Tokens},
+  author={Born, Jannis and Manica, Matteo},
+  journal={arXiv preprint arXiv:2202.01338},
+  note={Spotlight talk at ICLR workshop on Machine Learning for Drug Discovery},
+  year={2022}
+}
+```

model_cards/regression_transformer_description.md ADDED Viewed

	@@ -0,0 +1,8 @@

+### Concurrent sequence regression and generation for molecular language modeling
+The RT is a multitask Transformer that reformulates regression as a conditional sequence modeling task.
+This yields a dichotomous language model that seamlessly integrates regression with property-driven conditional generation.
+**Further reading:** [arXiv preprint](https://arxiv.org/abs/2202.01338) and [GitHub development code](https://github.com/IBM/regression-transformer).
+Each `algorithm_version` refers to one trained model. Each model can be used for **two tasks**, either to *predict* one (or multiple) properties of a molecule or to *generate* a molecule (given a seed molecule and a property constraint).

model_cards/regression_transformer_examples.csv ADDED Viewed

	@@ -0,0 +1,7 @@

+Molecules: Logp_and_synthesizability,Generate,CCOC1=NC=NC(=C1C)NCCOC(C)C,3,Sample,1.2,20,True,0.3,"<logp>:0.390, <scs>:2.628",N,(C)C,CCO
+Molecules: Qed,Generate,CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1,10,Sample,1.0,30,True,0.5,<qed>:0.75,"N, C","C(=O), CC",C1=CC=C(Cl)C=C1
+Molecules: Logp_and_synthesizability,Predict,<logp>[MASK][MASK][MASK][MASK][MASK]|<scs>[MASK][MASK][MASK][MASK][MASK]|[C][C][O][C][=N][C][=N][C][Branch1_2][Branch1_1][=C][Ring1][Branch1_2][C][N][C][C][O][C][Branch1_1][C][C][C],1,Greedy,1.0,30,False,0.0,,,,
+Proteins: Stability,Predict,<stab>[MASK][MASK][MASK][MASK][MASK]|GSQEVNSGTQTYKNASPEEAERIARKAGATTWTEKGNKWEIRI,1,Greedy,1.0,1,False,0.0,,,,
+Proteins: Stability,Generate,GSQEVNSGTQTYKNASPEEAERIARKAGATTWTEKGNKWEIRI,10,Sample,1.2,30,True,0.3,<stab>:0.393,,SQEVNSGTQTYKN,WTEK
+Molecules: Qed,Generate,<qed>0.717|[MASK][MASK][MASK][MASK][MASK][C][Branch2_1][Ring1][Ring1][MASK][MASK][=C][C][Branch1_1][C][C][=N][C][MASK][MASK][=C][C][=C][Ring1][O][Ring1][Branch1_2][=C][Ring2][MASK][MASK],10,Sample,1.2,30,False,0.0,,,,
+Molecules: Solubility,Generate,ClC(Cl)C(Cl)Cl,5,Sample,1.3,40,True,0.4,<esol>:0.754,,,

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gt4sd>=1.0.0
+gradio>=3.9
+markdown-it-py>=2.1.0
+mols2grid>=0.2.0
+pandas>=1.0.0

utils.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import json
+import logging
+import os
+from collections import defaultdict
+from typing import Dict, List, Tuple
+import mols2grid
+import pandas as pd
+from gt4sd.algorithms import (
+    RegressionTransformerMolecules,
+    RegressionTransformerProteins,
+)
+from gt4sd.algorithms.core import AlgorithmConfiguration
+from rdkit import Chem
+from terminator.selfies import decoder
+# Module-level logger; a NullHandler is attached so this module does not configure log output itself.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+def get_application(application: str) -> AlgorithmConfiguration:
+    """
+    Convert application name to AlgorithmConfiguration.
+    Args:
+        application: Molecules or Proteins
+    Returns:
+        The corresponding AlgorithmConfiguration
+    """
+    if application == "Molecules":
+        application = RegressionTransformerMolecules
+    elif application == "Proteins":
+        application = RegressionTransformerProteins
+    else:
+        raise ValueError(
+            "Currently only models for molecules and proteins are supported"
+        )
+    return application
+def get_inference_dict(
+    application: AlgorithmConfiguration, algorithm_version: str
+) -> Dict:
+    """
+    Get inference dictionary for a given application and algorithm version.
+    Args:
+        application: algorithm application (Molecules or Proteins)
+        algorithm_version: algorithm version (e.g. qed)
+    Returns:
+        A dictionary with the inference parameters.
+    """
+    config = application(algorithm_version=algorithm_version)
+    with open(os.path.join(config.ensure_artifacts(), "inference.json"), "r") as f:
+        data = json.load(f)
+    return data
+def get_rt_name(x: Dict) -> str:
+    """
+    Get the UI display name of the regression transformer.
+    Args:
+        x: dictionary with the inference parameters
+    Returns:
+        The display name
+    """
+    return (
+        x["algorithm_application"].split("Transformer")[-1]
+        + ": "
+        + x["algorithm_version"].capitalize()
+    )
+def draw_grid_predict(prediction: str, target: str, domain: str) -> str:
+    """
+    Uses mols2grid to draw a HTML grid for the prediction
+    Args:
+        prediction: Predicted sequence.
+        target: Target molecule
+        domain: Domain of the prediction (molecules or proteins)
+    Returns:
+        HTML to display
+    """
+    if domain not in ["Molecules", "Proteins"]:
+        raise ValueError(f"Unsupported domain {domain}")
+    seq = target.split("|")[-1]
+    converter = (
+        decoder
+        if domain == "Molecules"
+        else lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x))
+    )
+    try:
+        seq = converter(seq)
+    except Exception:
+        logger.warning(f"Could not draw sequence {seq}")
+    result = {"SMILES": [seq], "Name": ["Target"]}
+    # Add properties
+    for prop in prediction.split("<")[1:]:
+        result[
+            prop.split(">")[0]
+        ] = f"{prop.split('>')[0].capitalize()} = {prop.split('>')[1]}"
+    result_df = pd.DataFrame(result)
+    obj = mols2grid.display(
+        result_df,
+        tooltip=list(result.keys()),
+        height=900,
+        n_cols=1,
+        name="Results",
+        size=(600, 700),
+    )
+    return obj.data
+def draw_grid_generate(
+    samples: List[Tuple[str]], domain: str, n_cols: int = 5, size=(140, 200)
+) -> str:
+    """
+    Uses mols2grid to draw a HTML grid for the generated molecules
+    Args:
+        samples: The generated samples (with properties)
+        domain: Domain of the prediction (molecules or proteins)
+        n_cols: Number of columns in grid. Defaults to 5.
+        size: Size of molecule in grid. Defaults to (140, 200).
+    Returns:
+        HTML to display
+    """
+    if domain not in ["Molecules", "Proteins"]:
+        raise ValueError(f"Unsupported domain {domain}")
+    if domain == "Proteins":
+        try:
+            smis = list(
+                map(lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x[0])), samples)
+            )
+        except Exception:
+            logger.warning(f"Could not convert some sequences {samples}")
+    else:
+        smis = [s[0] for s in samples]
+    result = defaultdict(list)
+    result.update({"SMILES": smis, "Name": [f"sample_{i}" for i in range(len(smis))]})
+    # Create properties
+    properties = [s.split("<")[1] for s in samples[0][1].split(">")[:-1]]
+    # Fill properties
+    for sample in samples:
+        for prop in properties:
+            value = float(sample[1].split(prop)[-1][1:].split("<")[0])
+            result[prop].append(f"{prop} = {value}")
+    result_df = pd.DataFrame(result)
+    obj = mols2grid.display(
+        result_df,
+        tooltip=list(result.keys()),
+        height=1100,
+        n_cols=n_cols,
+        name="Results",
+        size=size,
+    )
+    return obj.data