christofid and jannisborn committed
Commit 2e605bf (0 parents)

Duplicate from GT4SD/hf-transformers


Co-authored-by: Jannis Born <jannisborn@users.noreply.huggingface.co>

Files changed (10)
  1. .gitattributes +34 -0
  2. .gitignore +1 -0
  3. LICENSE +21 -0
  4. README.md +15 -0
  5. app.py +114 -0
  6. model_cards/article.md +78 -0
  7. model_cards/description.md +6 -0
  8. model_cards/examples.csv +2 -0
  9. requirements.txt +30 -0
  10. utils.py +48 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ title: HF Transformers
+ emoji: 💡
+ colorFrom: green
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.9.1
+ app_file: app.py
+ pinned: false
+ python_version: 3.8.13
+ pypi_version: 20.2.4
+ duplicated_from: GT4SD/hf-transformers
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,114 @@
+ import logging
+ import pathlib
+ import gradio as gr
+ import pandas as pd
+ from gt4sd.algorithms.generation.hugging_face import (
+     HuggingFaceCTRLGenerator,
+     HuggingFaceGenerationAlgorithm,
+     HuggingFaceGPT2Generator,
+     HuggingFaceTransfoXLGenerator,
+     HuggingFaceOpenAIGPTGenerator,
+     HuggingFaceXLMGenerator,
+     HuggingFaceXLNetGenerator,
+ )
+ from gt4sd.algorithms.registry import ApplicationsRegistry
+
+
+ logger = logging.getLogger(__name__)
+ logger.addHandler(logging.NullHandler())
+
+ MODEL_FN = {
+     "HuggingFaceCTRLGenerator": HuggingFaceCTRLGenerator,
+     "HuggingFaceGPT2Generator": HuggingFaceGPT2Generator,
+     "HuggingFaceTransfoXLGenerator": HuggingFaceTransfoXLGenerator,
+     "HuggingFaceOpenAIGPTGenerator": HuggingFaceOpenAIGPTGenerator,
+     "HuggingFaceXLMGenerator": HuggingFaceXLMGenerator,
+     "HuggingFaceXLNetGenerator": HuggingFaceXLNetGenerator,
+ }
+
+
+ def run_inference(
+     model_type: str,
+     prompt: str,
+     length: float,
+     temperature: float,
+     prefix: str,
+     k: float,
+     p: float,
+     repetition_penalty: float,
+ ):
+     model = model_type.split("_")[0]
+     version = model_type.split("_")[1]
+
+     if model not in MODEL_FN.keys():
+         raise ValueError(f"Model type {model} not supported")
+     config = MODEL_FN[model](
+         algorithm_version=version,
+         prompt=prompt,
+         length=length,
+         temperature=temperature,
+         repetition_penalty=repetition_penalty,
+         k=k,
+         p=p,
+         prefix=prefix,
+     )
+
+     model = HuggingFaceGenerationAlgorithm(config)
+     text = list(model.sample(1))[0]
+
+     return text
+
+
+ if __name__ == "__main__":
+
+     # Preparation (retrieve all available algorithms)
+     all_algos = ApplicationsRegistry.list_available()
+     algos = [
+         x["algorithm_application"] + "_" + x["algorithm_version"]
+         for x in list(filter(lambda x: "HuggingFace" in x["algorithm_name"], all_algos))
+     ]
+
+     # Load metadata
+     metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
+
+     examples = pd.read_csv(metadata_root.joinpath("examples.csv"), header=None).fillna(
+         ""
+     )
+     print("Examples: ", examples.values.tolist())
+
+     with open(metadata_root.joinpath("article.md"), "r") as f:
+         article = f.read()
+     with open(metadata_root.joinpath("description.md"), "r") as f:
+         description = f.read()
+
+     demo = gr.Interface(
+         fn=run_inference,
+         title="HuggingFace language models",
+         inputs=[
+             gr.Dropdown(
+                 algos,
+                 label="Language model",
+                 value="HuggingFaceGPT2Generator_gpt2",
+             ),
+             gr.Textbox(
+                 label="Text prompt",
+                 placeholder="I'm a stochastic parrot.",
+                 lines=1,
+             ),
+             gr.Slider(minimum=5, maximum=100, value=20, label="Maximal length", step=1),
+             gr.Slider(
+                 minimum=0.6, maximum=1.5, value=1.1, label="Decoding temperature"
+             ),
+             gr.Textbox(
+                 label="Prefix", placeholder="Some prefix (before the prompt)", lines=1
+             ),
+             gr.Slider(minimum=2, maximum=500, value=50, label="Top-k", step=1),
+             gr.Slider(minimum=0.5, maximum=1, value=1.0, label="Decoding-p", step=1),
+             gr.Slider(minimum=0.5, maximum=5, value=1.0, label="Repetition penalty"),
+         ],
+         outputs=gr.Textbox(label="Output"),
+         article=article,
+         description=description,
+         examples=examples.values.tolist(),
+     )
+     demo.launch(debug=True, show_error=True)
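
`run_inference` above is a thin wrapper around the GT4SD generation API. A minimal sketch of the same call outside Gradio (keyword arguments taken from `run_inference`; the prompt and parameter values are only illustrative) might look like this:

```python
# Minimal sketch of the GT4SD call wrapped by run_inference in app.py.
# Assumes gt4sd is installed as pinned in requirements.txt; values are illustrative.
from gt4sd.algorithms.generation.hugging_face import (
    HuggingFaceGenerationAlgorithm,
    HuggingFaceGPT2Generator,
)

config = HuggingFaceGPT2Generator(
    algorithm_version="gpt2",  # the part of the dropdown value after "_"
    prompt="The role of generative models is",
    length=20,                 # "Maximal length"
    temperature=1.1,           # "Decoding temperature"
    repetition_penalty=1.0,
    k=50,                      # "Top-k"
    p=1.0,                     # "Decoding-p"
    prefix="",
)
model = HuggingFaceGenerationAlgorithm(config)
print(list(model.sample(1))[0])  # one generated text sample
```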
model_cards/article.md ADDED
@@ -0,0 +1,78 @@
+ # Model documentation & parameters
+
+ **Language model**: Type of language model to be used.
+
+ **Text prompt**: The text prompt to condition the model.
+
+ **Maximal length**: The maximal number of tokens in the generated text.
+
+ **Decoding temperature**: The temperature used in the decoding.
+
+ **Prefix**: A text prompt that is passed to the model **before** the prompt.
+
+ **Top-k**: Number of highest-probability tokens to keep (top-k sampling).
+
+ **Decoding-p**: Only the tokens whose cumulative probability sums up to this value are kept (nucleus sampling).
+
+ **Repetition penalty**: Penalty for repeating tokens. Leave unchanged; for the CTRL model, use 1.2.
+
+
+
+ # Model card -- HuggingFace
+
+ **Model Details**: Various Transformer-based language models.
+
+ **Developers**: HuggingFace developers
+
+ **Distributors**: HuggingFace developers' code integrated into GT4SD.
+
+ **Model date**: Varies between models.
+
+ **Model type**: Different types of `transformers` language models:
+ - CTRL: `CTRLLMHeadModel`
+ - GPT2: `GPT2LMHeadModel`
+ - XLNet: `XLNetLMHeadModel`
+ - OpenAIGPT: `OpenAIGPTLMHeadModel`
+ - TransfoXL: `TransfoXLLMHeadModel`
+ - XLM: `XLMWithLMHeadModel`
+
+ **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
+ N.A.
+
+ **Paper or other resource for more information**:
+ All documentation is available in the [transformers documentation](https://huggingface.co/docs/transformers/).
+
+ **License**: MIT
+
+ **Where to send questions or comments about the model**: Open an issue on the [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
+
+ **Intended Use. Use cases that were envisioned during development**: N.A.
+
+ **Primary intended uses/users**: N.A.
+
+ **Out-of-scope use cases**: Production-level inference, generation of harmful or offensive content.
+
+ **Metrics**: N.A.
+
+ **Datasets**: N.A.
+
+ **Ethical Considerations**: Unclear, please consult the original authors in case of questions.
+
+ **Caveats and Recommendations**: Unclear, please consult the original authors in case of questions.
+
+ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
+
+ ## Citation
+ ```bib
+ @inproceedings{wolf-etal-2020-transformers,
+     title = "Transformers: State-of-the-Art Natural Language Processing",
+     author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+     booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+     month = oct,
+     year = "2020",
+     address = "Online",
+     publisher = "Association for Computational Linguistics",
+     url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+     pages = "38--45"
+ }
+ ```
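
The decoding parameters documented at the top of this model card correspond to the standard sampling arguments in `transformers`. As a hypothetical sketch (plain `transformers` with GPT-2 chosen only for illustration, not the exact GT4SD code path):

```python
# Hypothetical mapping of the UI parameters onto plain `transformers` sampling
# arguments; model choice and values are illustrative only.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prefix = ""                                   # "Prefix" (prepended to the prompt)
prompt = "The role of generative models is"   # "Text prompt"
inputs = tokenizer(prefix + prompt, return_tensors="pt")

outputs = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=20,        # "Maximal length"
    temperature=1.1,          # "Decoding temperature"
    top_k=50,                 # "Top-k"
    top_p=1.0,                # "Decoding-p" (nucleus sampling)
    repetition_penalty=1.0,   # "Repetition penalty" (1.2 recommended for CTRL)
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```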
model_cards/description.md ADDED
@@ -0,0 +1,6 @@
+ <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120">
+
+ This UI gives access to some pretrained language models from [*HuggingFace*](https://github.com/huggingface/) that are distributed via GT4SD.
+
+ For **examples** and **documentation** of the model parameters, please see below.
+ Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv ADDED
@@ -0,0 +1,2 @@
+ HuggingFaceGPT2Generator_gpt2, The role of generative models is,20,1.1,,50,1,1
+ HuggingFaceOpenAIGPTGenerator_openai-gpt, The best country in the world is,10,0.9,,50,1,1
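
The CSV has no header row; its columns map positionally onto the Gradio inputs and hence onto the arguments of `run_inference` in app.py. Written out, the first example row corresponds to roughly this call:

```python
# The first examples.csv row expressed as the run_inference call it pre-fills
# in the UI (argument names taken from app.py).
run_inference(
    model_type="HuggingFaceGPT2Generator_gpt2",
    prompt="The role of generative models is",
    length=20,
    temperature=1.1,
    prefix="",            # empty field in the CSV
    k=50,
    p=1,
    repetition_penalty=1,
)
```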
requirements.txt ADDED
@@ -0,0 +1,30 @@
+ -f https://download.pytorch.org/whl/cpu/torch_stable.html
+ -f https://data.pyg.org/whl/torch-1.12.1+cpu.html
+ # pip==20.2.4
+ torch==1.12.1
+ torch-scatter
+ torch-spline-conv
+ torch-sparse
+ torch-geometric
+ torchvision==0.13.1
+ torchaudio==0.12.1
+ gt4sd>=1.1.6
+ diffusers==0.6.0
+ molgx>=0.22.0a1
+ molecule_generation
+ nglview
+ PyTDC==0.3.7
+ gradio==3.12.0
+ markdown-it-py>=2.1.0
+ mols2grid>=0.2.0
+ numpy==1.23.5
+ pandas>=1.0.0
+ terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
+ guacamol_baselines @ git+https://github.com/GT4SD/guacamol_baselines.git@v0.0.2
+ moses @ git+https://github.com/GT4SD/moses.git@v0.1.0
+ paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@0.0.4
+ paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@0.0.2
+ paccmann_gp @ git+https://github.com/PaccMann/paccmann_gp@0.1.1
+ paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@0.0.1.1
+ paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
+ reinvent_models @ git+https://github.com/GT4SD/reinvent_models@v0.0.1
utils.py ADDED
@@ -0,0 +1,48 @@
+ import logging
+ from collections import defaultdict
+ from typing import List
+
+ import mols2grid
+ import pandas as pd
+
+ logger = logging.getLogger(__name__)
+ logger.addHandler(logging.NullHandler())
+
+
+ def draw_grid_generate(
+     samples: List[str],
+     seeds: List[str] = [],
+     n_cols: int = 3,
+     size=(140, 200),
+ ) -> str:
+     """
+     Uses mols2grid to draw an HTML grid for the generated molecules.
+
+     Args:
+         samples: The generated samples.
+         seeds: Optional seed molecules shown alongside the samples.
+         n_cols: Number of columns in grid. Defaults to 3.
+         size: Size of molecule in grid. Defaults to (140, 200).
+
+     Returns:
+         HTML to display.
+     """
+
+     result = defaultdict(list)
+     result.update(
+         {
+             "SMILES": seeds + samples,
+             "Name": [f"Seed_{i}" for i in range(len(seeds))]
+             + [f"Generated_{i}" for i in range(len(samples))],
+         },
+     )
+
+     result_df = pd.DataFrame(result)
+     obj = mols2grid.display(
+         result_df,
+         tooltip=list(result.keys()),
+         height=1100,
+         n_cols=n_cols,
+         name="Results",
+         size=size,
+     )
+     return obj.data
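
`draw_grid_generate` is not referenced by app.py above. For completeness, a minimal, hypothetical usage (the SMILES strings are illustrative placeholders, not output of this Space) would be:

```python
# Hypothetical usage of draw_grid_generate from utils.py; SMILES are placeholders.
from utils import draw_grid_generate

html = draw_grid_generate(
    samples=["CCO", "c1ccccc1", "CC(=O)O"],  # "generated" molecules
    seeds=["C1CCCCC1"],                      # optional seed molecule
    n_cols=3,
)
# `html` is an HTML string that could be embedded, e.g., via gr.HTML(html).
```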