safoinme committed on
Commit
5cae627
1 Parent(s): df5df0c

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# app.py is a click command that only defines its own options, so the
# Streamlit-style "--server.port/--server.address" flags cannot be passed on
# the command line (click aborts with "No such option"). Gradio's launch()
# picks up the bind address and port from these environment variables instead.
ENV GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Port the Gradio server listens on (see GRADIO_SERVER_PORT above)
EXPOSE 7860

CMD ["python", "app.py"]
__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Apache Software License 2.0
2
+ #
3
+ # Copyright (c) ZenML GmbH 2023. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Apache Software License 2.0
2
+ #
3
+ # Copyright (c) ZenML GmbH 2023. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ import click
19
+ import numpy as np
20
+ import os
21
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
22
+ from os.path import dirname
23
+
24
+ import gradio as gr
25
+ from zenml.logger import get_logger
26
+
27
+ # Initialize logger
28
+ logger = get_logger(__name__)
29
+
30
@click.command()
@click.option(
    "--tokenizer_name_or_path",
    default="tokenizer",
    help="Name or the path of the tokenizer.",
)
@click.option(
    "--model_name_or_path",
    default="model",
    help="Name or the path of the model.",
)
@click.option(
    "--labels", default="Negative,Positive", help="Comma-separated list of labels."
)
@click.option(
    "--title", default="ZenML NLP Use-Case", help="Title of the Gradio interface."
)
@click.option(
    "--description",
    default="Sentiment Analyzer",
    help="Description of the Gradio interface.",
)
@click.option(
    "--interpretation",
    default="default",
    help="Interpretation mode for the Gradio interface.",
)
@click.option(
    "--examples",
    default="This is an awesome journey, I love it!",
    help="Comma-separated list of examples to show in the Gradio interface.",
)
def sentiment_analysis(
    tokenizer_name_or_path,
    model_name_or_path,
    labels,
    title,
    description,
    interpretation,
    examples,
):
    """Launch a Gradio sentiment-analysis demo backed by a local model."""
    # NOTE: click shows the docstring above as the command's --help text, so
    # the detailed notes live in comments instead.
    #
    # tokenizer_name_or_path / model_name_or_path: directory names resolved
    #     relative to the directory containing this script.
    # labels: comma-separated class names, paired positionally with the
    #     model's output scores.
    # title / description / interpretation: forwarded to gr.Interface.
    # examples: used as ONE example text (it is wrapped in a list, not split,
    #     despite what the option's help text says).
    labels = labels.split(",")
    examples = [examples]

    # Resolve the script directory from an absolute path so the lookup does
    # not depend on the current working directory or on __file__ being
    # relative (dirname("app.py") would be an empty string).
    base_dir = dirname(os.path.abspath(__file__))

    # Each artifact is loaded from the directory named by its own option.
    # The previous code built the model path from the tokenizer option and
    # vice versa, so each loader was pointed at the other artifact's folder.
    model_path = os.path.join(base_dir, model_name_or_path)
    tokenizer_path = os.path.join(base_dir, tokenizer_name_or_path)

    # Load once at start-up instead of on every request: the weights are
    # large and re-reading them per prediction makes each call slow.
    logger.info(f"Loading model from {model_path}")
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    logger.info(f"Loading tokenizer from {tokenizer_path}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def preprocess(text):
        """Replace user mentions with "@user" and links with "http"."""
        new_text = []
        for t in text.split(" "):
            t = "@user" if t.startswith("@") and len(t) > 1 else t
            t = "http" if t.startswith("http") else t
            new_text.append(t)
        return " ".join(new_text)

    def softmax(x):
        """Return the softmax of ``x`` along axis 0."""
        # Subtracting the maximum keeps np.exp from overflowing.
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def analyze_text(text):
        """Return a ``{label: probability}`` mapping for ``text``."""
        text = preprocess(text)
        encoded_input = tokenizer(text, return_tensors="pt")
        output = model(**encoded_input)
        # output[0] holds the logits; [0] selects the single input in the batch.
        scores_ = output[0][0].detach().numpy()
        scores_ = softmax(scores_)

        # zip() stops at the shorter sequence, so surplus labels or scores
        # are silently dropped.
        scores = {l: float(s) for (l, s) in zip(labels, scores_)}
        return scores

    # NOTE(review): `interpretation` is believed to have been removed from
    # gr.Interface in Gradio 4 — confirm against the installed Gradio version.
    demo = gr.Interface(
        fn=analyze_text,
        inputs=[gr.TextArea("Write your text or tweet here", label="Analyze Text")],
        outputs=["label"],
        title=title,
        description=description,
        interpretation=interpretation,
        examples=examples,
    )

    demo.launch(share=True, debug=True)


if __name__ == "__main__":
    sentiment_analysis()
model/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/Users/safoine-zenml/Library/Application Support/zenml/local_stores/7f2168ca-a26f-456a-ad57-df9f92cd8d69/mlruns/321726471800252444/bb06b501b2ec4d0d9d4b03d2403b2886/artifacts/nlp_use_case_model/model",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "problem_type": "single_label_classification",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.34.1",
25
+ "type_vocab_size": 1,
26
+ "use_cache": true,
27
+ "vocab_size": 50265
28
+ }
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:254eea426065025dfb31cb3823d55a8881fed3fa838ed9f02214c19215a8ef26
3
+ size 498655278
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
nltk
torch
torchvision
torchaudio
gradio
# app.py imports `click` and `zenml.logger` directly, so both must be declared
# here; otherwise the import fails (or only works through a transitive install).
click
zenml
datasets==2.12.0
numpy==1.22.4
pandas==1.5.3
session_info==1.0.0
scikit-learn==1.2.2
# NOTE(review): model/config.json records transformers_version 4.34.1, newer
# than this pin — confirm the saved artifacts load with 4.28.1.
transformers==4.28.1
IPython==7.34.0
serve.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Task name (optional), used for display purposes.
name: nlp_use_case

# Working directory (optional), synced to ~/sky_workdir on the remote cluster
# each time launch or exec is run with the yaml file.
#
# Commands in "setup" and "run" will be executed under it.
#
# If a .gitignore file (or a .git/info/exclude file) exists in the working
# directory, files and directories listed in it will be excluded from syncing.
workdir: ./gradio

setup: |
  echo "Begin setup."
  pip install -r requirements.txt
  echo "Setup complete."

# The previous run section was left over from a vLLM example: it invoked
# "python -m app.py" (not a valid module name), passed options app.py does not
# define (----tokenizer_name, --tensor-parallel-size, --tokenizer), waited
# for a log line that app.py never prints, and then started a vLLM web server
# that is not part of this project. app.py starts the Gradio server itself, so
# it is launched directly with the options it actually accepts.
# NOTE(review): "conda activate vllm" was dropped because setup installs the
# requirements without activating that environment — confirm for your cluster.
run: |
  echo 'Starting gradio server...'
  python -u app.py \
    --tokenizer_name_or_path tokenizer \
    --model_name_or_path model 2>&1 | tee gradio_server.log
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "do_lower_case": true,
49
+ "eos_token": "</s>",
50
+ "errors": "replace",
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 512,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff