Spaces:

safoinme
/

nlp_use_case

Sleeping

App Files Files Community

safoinme committed on Oct 26, 2023

Commit

5cae627

•

1 Parent(s): df5df0c

Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

Dockerfile +26 -0
__init__.py +16 -0
app.py +103 -0
model/config.json +28 -0
model/pytorch_model.bin +3 -0
requirements.txt +12 -0
serve.yaml +28 -0
tokenizer/merges.txt +0 -0
tokenizer/special_tokens_map.json +51 -0
tokenizer/tokenizer.json +0 -0
tokenizer/tokenizer_config.json +58 -0
tokenizer/vocab.json +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.9
+WORKDIR /code
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Make Gradio listen on all interfaces on the port Spaces expects.
+# app.py is a click command that defines no --server.* options, so the bind
+# address and port are configured through Gradio's environment variables
+# instead of command-line flags.
+ENV GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860
+EXPOSE 7860
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+# No extra arguments: click rejects options the command does not declare.
+CMD ["python", "app.py"]

__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#

app.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import click
+import numpy as np
+import os
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from os.path import dirname
+import gradio as gr
+from zenml.logger import get_logger
+# Initialize logger
+logger = get_logger(__name__)
+@click.command()
+@click.option("--tokenizer_name_or_path", default="tokenizer", help="Name or the path of the tokenizer.")
+@click.option("--model_name_or_path", default="model", help="Name or the path of the model.")
+@click.option(
+    "--labels", default="Negative,Positive", help="Comma-separated list of labels."
+)
+@click.option(
+    "--title", default="ZenML NLP Use-Case", help="Title of the Gradio interface."
+)
+@click.option(
+    "--description",
+    default="Sentiment Analyzer",
+    help="Description of the Gradio interface.",
+)
+@click.option(
+    "--interpretation",
+    default="default",
+    help="Interpretation mode for the Gradio interface.",
+)
+@click.option(
+    "--examples",
+    default="This is an awesome journey, I love it!",
+    help="Comma-separated list of examples to show in the Gradio interface.",
+)
+def sentiment_analysis(
+    tokenizer_name_or_path, model_name_or_path, labels, title, description, interpretation, examples
+):
+    labels = labels.split(",")
+    examples = [examples]
+    def preprocess(text):
+        new_text = []
+        for t in text.split(" "):
+            t = "@user" if t.startswith("@") and len(t) > 1 else t
+            t = "http" if t.startswith("http") else t
+            new_text.append(t)
+        return " ".join(new_text)
+    def softmax(x):
+        e_x = np.exp(x - np.max(x))
+        return e_x / e_x.sum(axis=0)
+    def analyze_text(text):
+        model_path = f"{dirname(__file__)}/{tokenizer_name_or_path}/"
+        logger.info(f"Loading model from {model_path}")
+        tokenizer_path = f"{dirname(__file__)}/{model_name_or_path}/"
+        logger.info(f"Loading tokenizer from {tokenizer_path}")
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        model = AutoModelForSequenceClassification.from_pretrained(model_path)
+        text = preprocess(text)
+        encoded_input = tokenizer(text, return_tensors="pt")
+        output = model(**encoded_input)
+        scores_ = output[0][0].detach().numpy()
+        scores_ = softmax(scores_)
+        scores = {l: float(s) for (l, s) in zip(labels, scores_)}
+        return scores
+    demo = gr.Interface(
+        fn=analyze_text,
+        inputs=[gr.TextArea("Write your text or tweet here", label="Analyze Text")],
+        outputs=["label"],
+        title=title,
+        description=description,
+        interpretation=interpretation,
+        examples=examples,
+    )
+    demo.launch(share=True, debug=True)
+# Script entry point: click parses the command-line options and invokes the command.
+if __name__ == "__main__":
+    sentiment_analysis()

model/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "/Users/safoine-zenml/Library/Application Support/zenml/local_stores/7f2168ca-a26f-456a-ad57-df9f92cd8d69/mlruns/321726471800252444/bb06b501b2ec4d0d9d4b03d2403b2886/artifacts/nlp_use_case_model/model",
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.34.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

model/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:254eea426065025dfb31cb3823d55a8881fed3fa838ed9f02214c19215a8ef26
+size 498655278

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+nltk
+torch
+torchvision
+torchaudio
+gradio
+datasets==2.12.0
+numpy==1.22.4
+pandas==1.5.3
+session_info==1.0.0
+scikit-learn==1.2.2
+transformers==4.28.1
+IPython==7.34.0
+# Imported directly by app.py
+click
+zenml

serve.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+# Task name (optional), used for display purposes.
+name: nlp_use_case
+# Working directory (optional), synced to ~/sky_workdir on the remote cluster
+# each time launch or exec is run with the yaml file.
+#
+# Commands in "setup" and "run" will be executed under it.
+#
+# If a .gitignore file (or a .git/info/exclude file) exists in the working
+# directory, files and directories listed in it will be excluded from syncing.
+workdir: ./gradio
+setup: |
+  echo "Begin setup."
+  pip install -r requirements.txt
+  echo "Setup complete."
+# app.py starts the Gradio server itself, so a single foreground command is
+# enough; only options that app.py actually declares are passed.
+run: |
+  echo 'Starting gradio server...'
+  python -u app.py \
+                   --tokenizer_name_or_path tokenizer \
+                   --model_name_or_path model 2>&1 | tee api_server.log

tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "do_lower_case": true,
+  "eos_token": "</s>",
+  "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff