Felix Marty committed on
Commit
35e3254
1 Parent(s): 586c827
Files changed (6) hide show
  1. Dockerfile +30 -0
  2. app.py +81 -4
  3. backend.py +101 -0
  4. defaults.py +38 -0
  5. requirements.txt +1 -0
  6. utils.py +25 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04

# Expose miniconda's binaries both at build time (ARG) and at run time (ENV).
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"

RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y wget && rm -rf /var/lib/apt/lists/*

# Install Miniconda in batch mode (default prefix: /root/miniconda3).
RUN wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh

RUN conda init bash

# install git
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
    git && \
    apt-get clean

RUN pip install torch torchvision torchaudio

RUN pip install torchserve torch-model-archiver torch-workflow-archiver

RUN pip install transformers optimum

# Fix: clone under /workspace so the final WORKDIR points at the checkout.
# Previously the clone ran in the default workdir "/", leaving
# /workspace/bettertransformer_demo empty.
WORKDIR /workspace
RUN git clone https://github.com/fxmarty/bettertransformer_demo.git

WORKDIR /workspace/bettertransformer_demo
app.py CHANGED
@@ -1,7 +1,84 @@
1
  import gradio as gr
2
 
3
def greet(name):
    """Build the demo's hello message for the given name."""
    greeting = "Hello " + name + "!!"
    return greeting
 
5
 
6
# Minimal text-in/text-out Gradio UI backed by greet(); launch() serves the app.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

# app.py is executed as a top-level script (demo.launch() below), so relative
# imports ("from .defaults import ...") would raise "attempted relative import
# with no known parent package". Use absolute sibling-module imports instead.
from defaults import defaults_vanilla_single, defaults_bt_spam, defaults_bt_single, defaults_vanilla_spam
from defaults import ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER
from backend import send_single, send_spam, get_message_single, get_message_spam

with gr.Blocks() as demo:
    # Page header. (Removed a stray `")` that leaked into the displayed text.)
    gr.Markdown("""
    Let's try out TorchServe + BetterTransformer! This is some longer description This is some longer description This is some longer description

    ## Inference using...
    """
    )

    with gr.Row():
        # Left column: baseline vanilla Transformers behind TorchServe.
        with gr.Column(scale=50):
            gr.Markdown("### Vanilla Transformers + TorchServe")

            # Hidden textbox used only to pass the backend URL to callbacks.
            address_input_vanilla = gr.Textbox(
                max_lines=1,
                label="ip vanilla",
                value=ADDRESS_VANILLA,
                visible=False,
            )

            input_model_vanilla = gr.Textbox(
                max_lines=1,
                label="Text",
                value="Expectations were low, enjoyment was high",
            )

            btn_single_vanilla = gr.Button("Send single text request")
            # Pre-populate outputs with canned defaults so the page isn't empty.
            output_single_vanilla = gr.Markdown(label="Output single vanilla", value=get_message_single(**defaults_vanilla_single))

            btn_spam_vanilla = gr.Button("Spam text requests (from sst2 validation set)")
            output_spam_vanilla = gr.Markdown(label="Output spam vanilla", value=get_message_spam(**defaults_vanilla_spam))

            btn_single_vanilla.click(
                fn=send_single,
                inputs=[input_model_vanilla, address_input_vanilla],
                outputs=output_single_vanilla,
            )
            btn_spam_vanilla.click(
                fn=send_spam,
                inputs=[address_input_vanilla],
                outputs=output_spam_vanilla,
            )

        # Right column: the same model converted with BetterTransformer.
        with gr.Column(scale=50):
            gr.Markdown("### BetterTransformer + TorchServe")

            address_input_bettertransformer = gr.Textbox(
                max_lines=1,
                label="ip bettertransformer",
                value=ADDRESS_BETTERTRANSFORMER,
                visible=False,
            )

            input_model_bettertransformer = gr.Textbox(
                max_lines=1,
                label="Text",
                value="Expectations were low, enjoyment was high",
            )

            btn_single_bt = gr.Button("Send single text request")
            output_single_bt = gr.Markdown(label="Output single bt", value=get_message_single(**defaults_bt_single))

            btn_spam_bt = gr.Button("Spam text requests (from sst2 validation set)")
            output_spam_bt = gr.Markdown(label="Output spam bt", value=get_message_spam(**defaults_bt_spam))

            btn_single_bt.click(
                fn=send_single,
                inputs=[input_model_bettertransformer, address_input_bettertransformer],
                outputs=output_single_bt,
            )

            btn_spam_bt.click(
                fn=send_spam,
                inputs=[address_input_bettertransformer],
                outputs=output_spam_bt,
            )

# Serialize requests: the demo backends are single small instances.
demo.queue(concurrency_count=1)
demo.launch()
backend.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

# backend.py is imported as a plain top-level module (see app.py), not as part
# of a package, so relative imports ("from .defaults import ...") would raise
# ImportError. Use absolute sibling-module imports instead.
from defaults import SPAM_N_REQUESTS, ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER, HEADERS
from utils import ElapsedFuturesSession

from datasets import load_dataset

# SST-2 validation split, loaded once at import time; used as the corpus for
# the "spam" benchmark. NOTE(review): this performs network/disk I/O on import.
data = load_dataset("glue", "sst2", split="validation")

# Markdown template for a single-request result; filled by get_message_single.
RETURN_MESSAGE_SINGLE = """
Inference statistics:

* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""

# Markdown template for the spam benchmark; filled by get_message_spam.
RETURN_MESSAGE_SPAM = """
Processing """ + f"{SPAM_N_REQUESTS}" + """ inputs sent asynchronously. Grab a coffee.

Inference statistics:

* Promise resolution time: {0} ms
* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
* Mean peak GPU memory: {2} MB
* Mean padding ratio: {3} %
* Mean sequence length: {4} tokens
"""
32
+
33
def get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs):
    """Fill the single-request Markdown template with the given stats."""
    stats = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*stats)

def get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length, **kwargs):
    """Fill the spam-benchmark Markdown template with the given stats."""
    stats = (resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length)
    return RETURN_MESSAGE_SPAM.format(*stats)
38
+
39
# One shared session so HTTP connections are reused across requests.
SESSION = ElapsedFuturesSession()

def send_single(input_model_vanilla, address: str):
    """Send one text to a TorchServe endpoint and render its stats as Markdown.

    Args:
        input_model_vanilla: Raw text to classify.
        address: Endpoint URL; must be one of the two known backends.

    Returns:
        Markdown string produced by get_message_single.

    Raises:
        ValueError: If ``address`` is not a known backend.
    """
    # `assert` is stripped under `python -O`; validate explicitly instead.
    if address not in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER):
        raise ValueError(f"Unknown backend address: {address}")

    promise = SESSION.post(address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"))

    response = promise.result()  # resolve immediately

    status = response.status_code

    # Server reply is a JSON list:
    # [prediction, inference_latency, peak_gpu_memory, ...] — TODO confirm
    # against the TorchServe handler.
    response_text = json.loads(response.text)
    prediction = response_text[0]
    inf_latency = response_text[1]
    peak_gpu_memory = response_text[2]
    # `elapsed` (ms) is set by ElapsedFuturesSession's response hook.
    end_to_end_latency = response.elapsed

    return get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
57
+
58
def send_spam(address: str):
    """Fire SPAM_N_REQUESTS asynchronous requests at a backend and aggregate stats.

    Samples SPAM_N_REQUESTS sentences from the sst2 validation set, posts them
    all concurrently, then averages the per-request statistics.

    Args:
        address: Endpoint URL; must be one of the two known backends.

    Returns:
        Markdown string produced by get_message_spam.

    Raises:
        ValueError: If ``address`` is unknown or the corpus is too small.
    """
    # `assert` is stripped under `python -O`; validate explicitly instead.
    if address not in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER):
        raise ValueError(f"Unknown backend address: {address}")
    if SPAM_N_REQUESTS > len(data):
        raise ValueError(f"SPAM_N_REQUESTS ({SPAM_N_REQUESTS}) exceeds dataset size ({len(data)})")

    # Fresh random sample on every call.
    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    resolution_time = 0
    mean_inference_latency = 0
    mean_peak_gpu_memory = 0

    n_pads = 0
    n_elems = 0
    sequence_length = 0

    # Issue all requests first so they are in flight concurrently ...
    promises = []
    for i in range(SPAM_N_REQUESTS):
        input_data = inp[i]["sentence"].encode("utf-8")
        promises.append(SESSION.post(address, headers=HEADERS, data=input_data))

    # ... then resolve them one by one and accumulate the statistics.
    for promise in promises:
        response = promise.result()

        # JSON list: [prediction, inference_latency, peak_gpu_memory,
        #             n_pads, n_elems, sequence_length] — TODO confirm
        # against the TorchServe handler.
        response_text = json.loads(response.text)

        # Overall resolution time = the slowest end-to-end response.
        resolution_time = max(resolution_time, response.elapsed)

        mean_inference_latency += response_text[1]
        mean_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]

    mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
    mean_sequence_length = sequence_length / SPAM_N_REQUESTS

    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)

    return get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length)
defaults.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Canned statistics used to pre-populate the demo's output panels before any
# real request is made (see app.py). NOTE(review): the vanilla and
# BetterTransformer defaults are currently identical placeholder numbers.
defaults_vanilla_single = {
    "status": 200,
    "prediction": "Accepted",
    "inf_latency": 20.77,
    "peak_gpu_memory": 2717.36,
    "end_to_end_latency": 93.65,
}

# Placeholder single-request stats for the BetterTransformer column.
defaults_bt_single = {
    "status": 200,
    "prediction": "Accepted",
    "inf_latency": 20.77,
    "peak_gpu_memory": 2717.36,
    "end_to_end_latency": 93.65,
}

# Placeholder spam-benchmark stats for the vanilla column.
defaults_vanilla_spam = {
    "resolution_time": 2996.35,
    "mean_inference_latency": 29.69,
    "mean_peak_gpu_memory": 3620.9,
    "mean_padding_ratio": 35.26,
    "mean_sequence_length": 39.395,
}

# Placeholder spam-benchmark stats for the BetterTransformer column.
defaults_bt_spam = {
    "resolution_time": 2996.35,
    "mean_inference_latency": 29.69,
    "mean_peak_gpu_memory": 3620.9,
    "mean_padding_ratio": 35.26,
    "mean_sequence_length": 39.395,
}

# Number of asynchronous requests fired by the "spam" benchmark (backend.py).
SPAM_N_REQUESTS = 200
BATCH_SIZE = 8  # fixed!  (presumably matches the server-side TorchServe batch size — TODO confirm)

# Requests are posted as raw UTF-8 text.
HEADERS = {"Content-Type": "text/plain"}
# TorchServe prediction endpoints for the two deployments.
ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
ADDRESS_BETTERTRANSFORMER = "http://3.95.36.2:8080/predictions/my_tc"
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ requests_futures
utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from requests_futures.sessions import FuturesSession
2
+
3
+ import time
4
+
5
class ElapsedFuturesSession(FuturesSession):
    """FuturesSession that stamps each response with its wall-clock latency.

    A response hook records, on ``response.elapsed``, the time in milliseconds
    (rounded to 2 decimals) between the request being issued and the response
    hooks running. NOTE: this overwrites requests' own ``Response.elapsed``
    timedelta with a float.
    """

    def request(self, method, url, hooks=None, *args, **kwargs):
        start = time.time()
        if hooks is None:
            hooks = {}

        def timing(r, *args, **kwargs):
            # End-to-end latency in milliseconds.
            r.elapsed = round((time.time() - start) * 1000, 2)

        if 'response' not in hooks:
            hooks['response'] = timing
        elif isinstance(hooks['response'], list):
            # Insert first so we don't time the other hooks' execution.
            hooks['response'].insert(0, timing)
        elif isinstance(hooks['response'], tuple):
            # Bug fix: tuples have no .insert(); the old code raised
            # AttributeError here. Rebuild as a list with timing first.
            hooks['response'] = [timing, *hooks['response']]
        else:
            # A single callable was supplied; wrap it in a list.
            hooks['response'] = [timing, hooks['response']]

        return super().request(method, url, hooks=hooks, *args, **kwargs)