Spaces:

GT4SD
/

regression_transformer

Running

File size: 6,268 Bytes

import logging
import pathlib

import gradio as gr
import pandas as pd
from gt4sd.algorithms.conditional_generation.regression_transformer import (
    RegressionTransformer,
)
from gt4sd.algorithms.registry import ApplicationsRegistry
from terminator.tokenization import PolymerGraphTokenizer
from utils import (
    draw_grid_generate,
    draw_grid_predict,
    get_application,
    get_inference_dict,
    get_rt_name,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Attach a no-op handler so this module does not itself decide where log
# records are emitted.
logger.addHandler(logging.NullHandler())


# Upper bound on additional sampling rounds for the `rop_catalyst` filter, so
# that a model which never yields an accepted sample cannot hang the request.
# The value is an arbitrary safety cap, not a tuned hyperparameter.
_MAX_RESAMPLING_ROUNDS = 20


def _split_csv(text, remove_spaces=False):
    """Split a comma-separated string into stripped, non-empty items.

    Args:
        text: raw textbox content; empty/None yields an empty list.
        remove_spaces: if True, drop every space before splitting.

    Returns:
        A list of non-empty strings.
    """
    if not text:
        return []
    if remove_spaces:
        text = text.replace(" ", "")
    return [item.strip() for item in text.split(",") if item.strip()]


def _parse_property_goals(property_goal):
    """Parse `name:value, name:value` text into a `{name: float}` dict.

    Args:
        property_goal: comma-separated `property:value` entries,
            e.g. `<qed>:0.75`.

    Returns:
        A dict mapping each (stripped) property name to its float goal.

    Raises:
        ValueError: if no entry is given, an entry lacks the `:` separator,
            or a value cannot be converted to float.
    """
    entries = _split_csv(property_goal)
    if not entries:
        raise ValueError(
            "For conditional generation you have to specify `property_goal`."
        )

    goals = {}
    for entry in entries:
        parts = entry.split(":")
        if len(parts) < 2:
            raise ValueError(
                f"Invalid property goal {entry!r}; expected `<property>:<value>`, "
                "e.g. `<qed>:0.75`."
            )
        try:
            goals[parts[0].strip()] = float(parts[1])
        except ValueError as error:
            raise ValueError(
                f"Invalid property goal {entry!r}; {parts[1].strip()!r} is not a number."
            ) from error
    return goals


def regression_transformer(
    algorithm: str,
    task: str,
    target: str,
    number_of_samples: int,
    search: str,
    temperature: float,
    tolerance: int,
    wrapper: bool,
    fraction_to_mask: float,
    property_goal: str,
    tokens_to_mask: str,
    substructures_to_mask: str,
    substructures_to_keep: str,
):
    """Run a Regression Transformer model and render its samples.

    Args:
        algorithm: display name of the algorithm. The text before the first
            `:` selects the application/domain and the last space-separated
            word (lowercased) is used as the algorithm version.
        task: `"Predict"` renders only the first sample via
            `draw_grid_predict`; any other value renders all samples via
            `draw_grid_generate`.
        target: input passed to `RegressionTransformer` as `target`.
        number_of_samples: number of samples requested from the model.
        search: search strategy name; lowercased before being passed on.
        temperature: decoding temperature passed to the configuration.
        tolerance: tolerance passed to the configuration.
        wrapper: whether to build a sampling wrapper. Ignored (with a
            warning) when `task` is `"Predict"`.
        fraction_to_mask: forwarded in the sampling wrapper.
        property_goal: comma-separated `property:value` entries; required
            when a sampling wrapper is built.
        tokens_to_mask: comma-separated tokens, forwarded in the sampling
            wrapper.
        substructures_to_mask: comma-separated substructures (spaces are
            removed), forwarded in the sampling wrapper.
        substructures_to_keep: comma-separated substructures (spaces are
            removed), forwarded in the sampling wrapper.

    Returns:
        Whatever `draw_grid_predict` / `draw_grid_generate` returns.

    Raises:
        ValueError: if a sampling wrapper is requested and `property_goal`
            is missing or malformed.
        RuntimeError: if, for `rop_catalyst` generation, no accepted sample
            was produced within the resampling budget.
    """
    if task == "Predict" and wrapper:
        logger.warning(
            f"For prediction, no sampling_wrapper will be used, ignoring: fraction_to_mask: {fraction_to_mask}, "
            f"tokens_to_mask: {tokens_to_mask}, substructures_to_mask={substructures_to_mask}, "
            f"substructures_to_keep: {substructures_to_keep}."
        )
        sampling_wrapper = {}
    elif not wrapper:
        sampling_wrapper = {}
    else:
        sampling_wrapper = {
            "substructures_to_keep": _split_csv(
                substructures_to_keep, remove_spaces=True
            ),
            "substructures_to_mask": _split_csv(
                substructures_to_mask, remove_spaces=True
            ),
            # Previously parsed but never forwarded, so the UI field had no
            # effect on sampling.
            "tokens_to_mask": _split_csv(tokens_to_mask),
            "text_filtering": False,
            "fraction_to_mask": fraction_to_mask,
            "property_goal": _parse_property_goals(property_goal),
        }

    domain = algorithm.split(":")[0]
    algorithm_version = algorithm.split(" ")[-1].lower()
    algorithm_application = get_application(domain)
    config = algorithm_application(
        algorithm_version=algorithm_version,
        search=search.lower(),
        temperature=temperature,
        tolerance=tolerance,
        sampling_wrapper=sampling_wrapper,
    )
    model = RegressionTransformer(configuration=config, target=target)
    samples = list(model.sample(number_of_samples))

    if algorithm_version == "rop_catalyst" and task == "Generate":
        # Only samples whose text contains "." are accepted for this version.
        # NOTE(review): presumably "." separates the components of a valid
        # ROP sample -- confirm against the model card.
        correct_samples = [(s, p) for s, p in samples if "." in s]
        rounds = 0
        while (
            len(correct_samples) < number_of_samples
            and rounds < _MAX_RESAMPLING_ROUNDS
        ):
            rounds += 1
            for s, p in model.sample(number_of_samples):
                if "." in s and (s, p) not in correct_samples:
                    correct_samples.append((s, p))

        if len(correct_samples) < number_of_samples:
            if not correct_samples:
                raise RuntimeError(
                    "The model did not produce any valid sample; try a different "
                    "input or different sampling settings."
                )
            logger.warning(
                "Only %d of %d requested valid samples were found after %d "
                "resampling rounds.",
                len(correct_samples),
                number_of_samples,
                rounds,
            )
        samples = correct_samples

    if task == "Predict":
        return draw_grid_predict(samples[0], target, domain=domain)
    return draw_grid_generate(samples, domain=domain)


if __name__ == "__main__":

    # Preparation: retrieve all registered algorithms and keep only the
    # Regression Transformer ones.
    all_algos = ApplicationsRegistry.list_available()
    rt_algos = [
        algo for algo in all_algos if "RegressionTransformer" in algo["algorithm_name"]
    ]
    rt_names = [get_rt_name(algo) for algo in rt_algos]

    # Collect the inference dict of every RT algorithm version.
    # NOTE(review): `properties` is not read anywhere below in this script;
    # the loop is kept in case `get_inference_dict` has side effects that the
    # app relies on -- confirm and remove if not.
    properties = {}
    for algo in rt_algos:
        application = get_application(
            algo["algorithm_application"].split("Transformer")[-1]
        )
        data = get_inference_dict(
            application=application, algorithm_version=algo["algorithm_version"]
        )
        properties[get_rt_name(algo)] = data

    # Load metadata shipped next to this file.
    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

    # Example rows for the UI; missing cells become empty strings.
    examples = pd.read_csv(
        metadata_root.joinpath("regression_transformer_examples.csv"), header=None
    ).fillna("")

    # Read with an explicit encoding so the result does not depend on the
    # platform's locale default.
    article = metadata_root.joinpath("regression_transformer_article.md").read_text(
        encoding="utf-8"
    )
    description = metadata_root.joinpath(
        "regression_transformer_description.md"
    ).read_text(encoding="utf-8")

    # The order of `inputs` must match the positional parameters of
    # `regression_transformer`.
    demo = gr.Interface(
        fn=regression_transformer,
        title="Regression Transformer",
        inputs=[
            gr.Dropdown(rt_names, label="Algorithm version", value="Molecules: Qed"),
            gr.Radio(choices=["Predict", "Generate"], label="Task", value="Generate"),
            gr.Textbox(
                label="Input", placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1", lines=1
            ),
            gr.Slider(
                minimum=1, maximum=50, value=10, label="Number of samples", step=1
            ),
            gr.Radio(choices=["Sample", "Greedy"], label="Search", value="Sample"),
            gr.Slider(minimum=0.5, maximum=2, value=1, label="Decoding temperature"),
            gr.Slider(minimum=5, maximum=100, value=30, label="Tolerance", step=1),
            gr.Radio(choices=[True, False], label="Sampling Wrapper", value=True),
            gr.Slider(minimum=0, maximum=1, value=0.5, label="Fraction to mask"),
            gr.Textbox(label="Property goal", placeholder="<qed>:0.75", lines=1),
            gr.Textbox(label="Tokens to mask", placeholder="N, C", lines=1),
            gr.Textbox(
                label="Substructures to mask", placeholder="C(=O), C#C", lines=1
            ),
            gr.Textbox(
                label="Substructures to keep", placeholder="C1=CC=C(Cl)C=C1", lines=1
            ),
        ],
        outputs=gr.HTML(label="Output"),
        article=article,
        description=description,
        examples=examples.values.tolist(),
    )
    demo.launch(debug=True, show_error=True)