Felix Marty committed on
Commit
7d58e23
1 Parent(s): 4843fe3

hopefully stable

Browse files
Files changed (3) hide show
  1. app.py +79 -69
  2. backend.py +15 -19
  3. defaults.py +21 -21
app.py CHANGED
@@ -1,55 +1,66 @@
1
- import gradio as gr
2
-
3
  import json
4
- from backend import get_message_single, get_message_spam, send_single, send_spam, tokenizer
5
- from defaults import (
6
- ADDRESS_BETTERTRANSFORMER,
7
- ADDRESS_VANILLA,
8
- defaults_bt_single,
9
- defaults_bt_spam,
10
- defaults_vanilla_single,
11
- defaults_vanilla_spam,
12
- )
13
 
14
  import datasets
 
15
  import torch
16
 
17
- def dispatch_single(input_model_single, address_input_vanilla, address_input_bettertransformer):
 
 
 
 
 
 
 
 
 
18
  result_vanilla = send_single(input_model_single, address_input_vanilla)
19
- result_bettertransformer = send_single(input_model_single, address_input_bettertransformer)
20
-
 
 
21
  return result_vanilla, result_bettertransformer
22
 
23
- def dispatch_spam_artif(input_n_spam_artif, sequence_length, padding_ratio, address_input_vanilla, address_input_bettertransformer):
 
 
 
 
 
 
 
24
  sequence_length = int(sequence_length)
25
  input_n_spam_artif = int(input_n_spam_artif)
26
-
27
  inp_tokens = torch.randint(tokenizer.vocab_size - 1, (sequence_length,)) + 1
28
 
29
  n_pads = max(int(padding_ratio * len(inp_tokens)), 1)
30
- inp_tokens[- n_pads:] = 0
31
 
32
  inp_tokens[0] = 101
33
- inp_tokens[- n_pads - 1] = 102
34
-
35
  attention_mask = torch.zeros((sequence_length,), dtype=torch.int64)
36
- attention_mask[:- n_pads] = 1
37
-
38
- str_input = json.dumps({
39
- "input_ids": inp_tokens.cpu().tolist(),
40
- "attention_mask": attention_mask.cpu().tolist(),
41
- "pre_tokenized": True,
42
- })
43
-
 
 
44
  input_dataset = datasets.Dataset.from_dict(
45
  {"sentence": [str_input for _ in range(input_n_spam_artif)]}
46
  )
47
-
48
  result_vanilla = send_spam(input_dataset, address_input_vanilla)
49
  result_bettertransformer = send_spam(input_dataset, address_input_bettertransformer)
50
 
51
  return result_vanilla, result_bettertransformer
52
 
 
53
  TTILE_IMAGE = """
54
  <div
55
  style="
@@ -63,34 +74,17 @@ TTILE_IMAGE = """
63
  </div>
64
  """
65
 
66
- TITLE = """
67
- <div
68
- style="
69
- display: inline-flex;
70
- align-items: center;
71
- text-align: center;
72
- max-width: 1400px;
73
- gap: 0.8rem;
74
- font-size: 2.2rem;
75
- "
76
- >
77
- <h1 style="font-weight: 500; margin-bottom: 10px; margin-top: 10px;">
78
- Speed up your inference and support more workload with PyTorch's BetterTransformer 🤗
79
- </h1>
80
- </div>
81
- """
82
-
83
  with gr.Blocks() as demo:
84
  gr.HTML(TTILE_IMAGE)
85
- gr.HTML(TITLE)
 
 
86
 
87
  gr.Markdown(
88
  """
89
- Let's try out TorchServe + BetterTransformer!
90
-
91
- BetterTransformer is a stable feature made available with [PyTorch 1.13](https://pytorch.org/blog/PyTorch-1.13-release/) allowing to use a fastpath execution for encoder attention blocks.
92
 
93
- As a one-liner, you can convert your 🤗 Transformers models to use BetterTransformer thanks to the [🤗 Optimum](https://huggingface.co/docs/optimum/main/en/index) library:
94
 
95
  ```
96
  from optimum.bettertransformer import BetterTransformer
@@ -98,18 +92,13 @@ with gr.Blocks() as demo:
98
  better_model = BetterTransformer.transform(model)
99
  ```
100
 
101
- This Space is a demo of an **end-to-end** deployement of PyTorch eager-mode models, both with and without BetterTransformer. The goal is to see what are the benefits server-side and client-side of using BetterTransformer.
102
-
103
- ## Inference using...
104
- """
105
  )
106
 
107
- with gr.Row():
108
- with gr.Column(scale=50):
109
- gr.Markdown("### Vanilla Transformers + TorchServe")
110
- with gr.Column(scale=50):
111
- gr.Markdown("### BetterTransformer + TorchServe")
112
-
113
  address_input_vanilla = gr.Textbox(
114
  max_lines=1, label="ip vanilla", value=ADDRESS_VANILLA, visible=False
115
  )
@@ -124,30 +113,44 @@ with gr.Blocks() as demo:
124
  input_model_single = gr.Textbox(
125
  max_lines=1,
126
  label="Text",
127
- value="Expectations were low, enjoyment was high",
128
  )
129
 
130
  btn_single = gr.Button("Send single text request")
131
  with gr.Row():
132
  with gr.Column(scale=50):
 
133
  output_single_vanilla = gr.Markdown(
134
  label="Output single vanilla",
135
  value=get_message_single(**defaults_vanilla_single),
136
  )
137
  with gr.Column(scale=50):
 
138
  output_single_bt = gr.Markdown(
139
  label="Output single bt", value=get_message_single(**defaults_bt_single)
140
  )
141
 
142
  btn_single.click(
143
  fn=dispatch_single,
144
- inputs=[input_model_single, address_input_vanilla, address_input_bettertransformer],
 
 
 
 
145
  outputs=[output_single_vanilla, output_single_bt],
146
  )
147
 
 
 
 
 
 
 
 
 
148
  input_n_spam_artif = gr.Number(
149
  label="Number of inputs to send",
150
- value=8,
151
  )
152
  sequence_length = gr.Number(
153
  label="Sequence length (in tokens)",
@@ -155,28 +158,35 @@ with gr.Blocks() as demo:
155
  )
156
  padding_ratio = gr.Number(
157
  label="Padding ratio",
158
- value=0.5,
159
- )
160
- btn_spam_artif = gr.Button(
161
- "Spam text requests (using artificial data)"
162
  )
 
 
163
  with gr.Row():
164
  with gr.Column(scale=50):
 
165
  output_spam_vanilla_artif = gr.Markdown(
166
  label="Output spam vanilla",
167
  value=get_message_spam(**defaults_vanilla_spam),
168
  )
169
  with gr.Column(scale=50):
 
170
  output_spam_bt_artif = gr.Markdown(
171
  label="Output spam bt", value=get_message_spam(**defaults_bt_spam)
172
  )
173
 
174
  btn_spam_artif.click(
175
  fn=dispatch_spam_artif,
176
- inputs=[input_n_spam_artif, sequence_length, padding_ratio, address_input_vanilla, address_input_bettertransformer],
 
 
 
 
 
 
177
  outputs=[output_spam_vanilla_artif, output_spam_bt_artif],
178
  )
179
 
180
 
181
  demo.queue(concurrency_count=1)
182
- demo.launch()
 
 
 
1
  import json
 
 
 
 
 
 
 
 
 
2
 
3
  import datasets
4
+ import gradio as gr
5
  import torch
6
 
7
+ from backend import (get_message_single, get_message_spam, send_single,
8
+ send_spam, tokenizer)
9
+ from defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA,
10
+ defaults_bt_single, defaults_bt_spam,
11
+ defaults_vanilla_single, defaults_vanilla_spam)
12
+
13
+
14
+ def dispatch_single(
15
+ input_model_single, address_input_vanilla, address_input_bettertransformer
16
+ ):
17
  result_vanilla = send_single(input_model_single, address_input_vanilla)
18
+ result_bettertransformer = send_single(
19
+ input_model_single, address_input_bettertransformer
20
+ )
21
+
22
  return result_vanilla, result_bettertransformer
23
 
24
+
25
+ def dispatch_spam_artif(
26
+ input_n_spam_artif,
27
+ sequence_length,
28
+ padding_ratio,
29
+ address_input_vanilla,
30
+ address_input_bettertransformer,
31
+ ):
32
  sequence_length = int(sequence_length)
33
  input_n_spam_artif = int(input_n_spam_artif)
34
+
35
  inp_tokens = torch.randint(tokenizer.vocab_size - 1, (sequence_length,)) + 1
36
 
37
  n_pads = max(int(padding_ratio * len(inp_tokens)), 1)
38
+ inp_tokens[-n_pads:] = 0
39
 
40
  inp_tokens[0] = 101
41
+ inp_tokens[-n_pads - 1] = 102
42
+
43
  attention_mask = torch.zeros((sequence_length,), dtype=torch.int64)
44
+ attention_mask[:-n_pads] = 1
45
+
46
+ str_input = json.dumps(
47
+ {
48
+ "input_ids": inp_tokens.cpu().tolist(),
49
+ "attention_mask": attention_mask.cpu().tolist(),
50
+ "pre_tokenized": True,
51
+ }
52
+ )
53
+
54
  input_dataset = datasets.Dataset.from_dict(
55
  {"sentence": [str_input for _ in range(input_n_spam_artif)]}
56
  )
57
+
58
  result_vanilla = send_spam(input_dataset, address_input_vanilla)
59
  result_bettertransformer = send_spam(input_dataset, address_input_bettertransformer)
60
 
61
  return result_vanilla, result_bettertransformer
62
 
63
+
64
  TTILE_IMAGE = """
65
  <div
66
  style="
 
74
  </div>
75
  """
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  with gr.Blocks() as demo:
78
  gr.HTML(TTILE_IMAGE)
79
+ gr.Markdown(
80
+ "# Speed up your inference and support more workload with PyTorch's BetterTransformer 🤗"
81
+ )
82
 
83
  gr.Markdown(
84
  """
85
+ Let's try out [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) + [TorchServe](https://pytorch.org/serve/)!
 
 
86
 
87
+ BetterTransformer is a stable feature made available with [PyTorch 1.13](https://pytorch.org/blog/PyTorch-1.13-release/) allowing to use a fastpath execution for encoder attention blocks. Depending on your hardware, batch size, sequence length, padding ratio, it can bring large speedups at inference **at no cost in prediction quality**. As a one-liner, you can convert your 🤗 Transformers models to use BetterTransformer thanks to the integration in the [🤗 Optimum](https://github.com/huggingface/optimum) library:
88
 
89
  ```
90
  from optimum.bettertransformer import BetterTransformer
 
92
  better_model = BetterTransformer.transform(model)
93
  ```
94
 
95
+ This Space is a demo of an **end-to-end** deployement of PyTorch eager-mode models, both with and without BetterTransformer. The goal is to see what are the benefits server-side and client-side of using BetterTransformer. The model used is [`distilbert-base-uncased-finetuned-sst-2-english`](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english), and TorchServe is parametrized to use a maximum batch size of 8. **Beware:** you may be queued in case several persons use the Space at the same time.
96
+
97
+ For more details on the TorchServe implementation and to reproduce, see [this reference code](https://github.com/fxmarty/bettertransformer_demo). For more details on BetterTransformer, check out the [blog post on PyTorch's Medium](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2), and [the Optimum documentation](https://huggingface.co/docs/optimum/bettertransformer/overview)!"""
 
98
  )
99
 
100
+ gr.Markdown("## Single input scenario")
101
+
 
 
 
 
102
  address_input_vanilla = gr.Textbox(
103
  max_lines=1, label="ip vanilla", value=ADDRESS_VANILLA, visible=False
104
  )
 
113
  input_model_single = gr.Textbox(
114
  max_lines=1,
115
  label="Text",
116
+ value="Expectations were low, enjoyment was high. Although the music was not top level, the story was well-paced.",
117
  )
118
 
119
  btn_single = gr.Button("Send single text request")
120
  with gr.Row():
121
  with gr.Column(scale=50):
122
+ gr.Markdown("### Vanilla Transformers + TorchServe")
123
  output_single_vanilla = gr.Markdown(
124
  label="Output single vanilla",
125
  value=get_message_single(**defaults_vanilla_single),
126
  )
127
  with gr.Column(scale=50):
128
+ gr.Markdown("### BetterTransformer + TorchServe")
129
  output_single_bt = gr.Markdown(
130
  label="Output single bt", value=get_message_single(**defaults_bt_single)
131
  )
132
 
133
  btn_single.click(
134
  fn=dispatch_single,
135
+ inputs=[
136
+ input_model_single,
137
+ address_input_vanilla,
138
+ address_input_bettertransformer,
139
+ ],
140
  outputs=[output_single_vanilla, output_single_bt],
141
  )
142
 
143
+ gr.Markdown(
144
+ """
145
+ **Beware that the end-to-end latency can be impacted by a different ping time between the two servers.**
146
+
147
+ ## Heavy workload scenario
148
+ """
149
+ )
150
+
151
  input_n_spam_artif = gr.Number(
152
  label="Number of inputs to send",
153
+ value=80,
154
  )
155
  sequence_length = gr.Number(
156
  label="Sequence length (in tokens)",
 
158
  )
159
  padding_ratio = gr.Number(
160
  label="Padding ratio",
161
+ value=0.7,
 
 
 
162
  )
163
+ btn_spam_artif = gr.Button("Spam text requests (using artificial data)")
164
+
165
  with gr.Row():
166
  with gr.Column(scale=50):
167
+ gr.Markdown("### Vanilla Transformers + TorchServe")
168
  output_spam_vanilla_artif = gr.Markdown(
169
  label="Output spam vanilla",
170
  value=get_message_spam(**defaults_vanilla_spam),
171
  )
172
  with gr.Column(scale=50):
173
+ gr.Markdown("### BetterTransformer + TorchServe")
174
  output_spam_bt_artif = gr.Markdown(
175
  label="Output spam bt", value=get_message_spam(**defaults_bt_spam)
176
  )
177
 
178
  btn_spam_artif.click(
179
  fn=dispatch_spam_artif,
180
+ inputs=[
181
+ input_n_spam_artif,
182
+ sequence_length,
183
+ padding_ratio,
184
+ address_input_vanilla,
185
+ address_input_bettertransformer,
186
+ ],
187
  outputs=[output_spam_vanilla_artif, output_spam_bt_artif],
188
  )
189
 
190
 
191
  demo.queue(concurrency_count=1)
192
+ demo.launch()
backend.py CHANGED
@@ -1,16 +1,12 @@
1
  import json
 
2
 
3
- from defaults import (
4
- ADDRESS_BETTERTRANSFORMER,
5
- ADDRESS_VANILLA,
6
- HEADERS,
7
- MODEL_NAME,
8
- )
9
  from requests_futures.sessions import FuturesSession
10
-
11
  from transformers import AutoTokenizer
12
 
13
- import time
 
14
 
15
  RETURN_MESSAGE_SINGLE = """
16
  Inference statistics:
@@ -23,10 +19,8 @@ Inference statistics:
23
  * Padding ratio: 0.0 %
24
  """
25
 
26
- RETURN_MESSAGE_SPAM = (
27
- """
28
- Processing """
29
- + "NUMBER REQ" + """ inputs sent asynchronously. Grab a coffee.
30
 
31
  Inference statistics:
32
 
@@ -37,10 +31,10 @@ Inference statistics:
37
  * Mean sequence length: {4} tokens
38
  * Effective mean batch size: {5}
39
  """
40
- )
41
 
42
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
43
 
 
44
  def get_message_single(
45
  status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
46
  ):
@@ -70,14 +64,16 @@ def get_message_spam(
70
 
71
  SESSION = FuturesSession()
72
 
73
- def send_single(input_model_vanilla, address: str):
 
74
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
75
 
76
  # should not take more than 10 s, so timeout if that's the case
77
- start = time.time()
78
- promise = SESSION.post(
79
- address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"), timeout=10
80
  )
 
 
81
 
82
  try:
83
  response = promise.result() # resolve ASAP
@@ -98,7 +94,7 @@ def send_single(input_model_vanilla, address: str):
98
  )
99
 
100
 
101
- def send_spam(inp, address: str):
102
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
103
 
104
  mean_inference_latency = 0
@@ -129,7 +125,7 @@ def send_spam(inp, address: str):
129
  response = promise.result() # resolve ASAP
130
  except Exception as e:
131
  return f"{e}"
132
-
133
  end = max(time.time(), end)
134
 
135
  # then other metrics
 
1
  import json
2
+ import time
3
 
4
+ from datasets import Dataset
 
 
 
 
 
5
  from requests_futures.sessions import FuturesSession
 
6
  from transformers import AutoTokenizer
7
 
8
+ from defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA, HEADERS,
9
+ MODEL_NAME)
10
 
11
  RETURN_MESSAGE_SINGLE = """
12
  Inference statistics:
 
19
  * Padding ratio: 0.0 %
20
  """
21
 
22
+ RETURN_MESSAGE_SPAM = """
23
+ Processing inputs sent asynchronously. Grab a coffee.
 
 
24
 
25
  Inference statistics:
26
 
 
31
  * Mean sequence length: {4} tokens
32
  * Effective mean batch size: {5}
33
  """
 
34
 
35
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
36
 
37
+
38
  def get_message_single(
39
  status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
40
  ):
 
64
 
65
  SESSION = FuturesSession()
66
 
67
+
68
+ def send_single(input_model_vanilla: str, address: str):
69
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
70
 
71
  # should not take more than 10 s, so timeout if that's the case
72
+ inp = json.dumps({"text": input_model_vanilla, "pre_tokenized": False}).encode(
73
+ "utf-8"
 
74
  )
75
+ start = time.time()
76
+ promise = SESSION.post(address, headers=HEADERS, data=inp, timeout=10)
77
 
78
  try:
79
  response = promise.result() # resolve ASAP
 
94
  )
95
 
96
 
97
+ def send_spam(inp: Dataset, address: str):
98
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
99
 
100
  mean_inference_latency = 0
 
125
  response = promise.result() # resolve ASAP
126
  except Exception as e:
127
  return f"{e}"
128
+
129
  end = max(time.time(), end)
130
 
131
  # then other metrics
defaults.py CHANGED
@@ -1,35 +1,35 @@
1
  defaults_vanilla_single = {
2
  "status": 200,
3
- "prediction": "Accepted",
4
- "inf_latency": 20.77,
5
- "peak_gpu_memory": 2717.36,
6
- "end_to_end_latency": 93.65,
7
  }
8
 
9
  defaults_bt_single = {
10
  "status": 200,
11
- "prediction": "Accepted",
12
- "inf_latency": 20.77,
13
- "peak_gpu_memory": 2717.36,
14
- "end_to_end_latency": 93.65,
15
  }
16
 
17
  defaults_vanilla_spam = {
18
- "throughput": 20,
19
- "mean_inference_latency": 29.69,
20
- "mean_peak_gpu_memory": 3620.9,
21
- "mean_padding_ratio": 35.26,
22
- "mean_sequence_length": 39.395,
23
- "effective_batch_size": 8,
24
  }
25
 
26
  defaults_bt_spam = {
27
- "throughput": 20,
28
- "mean_inference_latency": 29.69,
29
- "mean_peak_gpu_memory": 3620.9,
30
- "mean_padding_ratio": 35.26,
31
- "mean_sequence_length": 39.395,
32
- "effective_batch_size": 8,
33
  }
34
 
35
  BATCH_SIZE = 8 # fixed!
@@ -37,4 +37,4 @@ BATCH_SIZE = 8 # fixed!
37
  HEADERS = {"Content-Type": "text/plain"}
38
  ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
39
  ADDRESS_BETTERTRANSFORMER = "http://3.95.136.2:8080/predictions/my_tc"
40
- MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
 
1
  defaults_vanilla_single = {
2
  "status": 200,
3
+ "prediction": "Positive",
4
+ "inf_latency": 7.66,
5
+ "peak_gpu_memory": 2706.21,
6
+ "end_to_end_latency": 309.65,
7
  }
8
 
9
  defaults_bt_single = {
10
  "status": 200,
11
+ "prediction": "Positive",
12
+ "inf_latency": 6.01,
13
+ "peak_gpu_memory": 2706.22,
14
+ "end_to_end_latency": 303.53,
15
  }
16
 
17
  defaults_vanilla_spam = {
18
+ "throughput": 28.04,
19
+ "mean_inference_latency": 24.43,
20
+ "mean_peak_gpu_memory": 2907.92,
21
+ "mean_padding_ratio": 69.53,
22
+ "mean_sequence_length": 128.0,
23
+ "effective_batch_size": 4.3,
24
  }
25
 
26
  defaults_bt_spam = {
27
+ "throughput": 38.53,
28
+ "mean_inference_latency": 12.73,
29
+ "mean_peak_gpu_memory": 2761.64,
30
+ "mean_padding_ratio": 69.53,
31
+ "mean_sequence_length": 128.0,
32
+ "effective_batch_size": 4.7,
33
  }
34
 
35
  BATCH_SIZE = 8 # fixed!
 
37
  HEADERS = {"Content-Type": "text/plain"}
38
  ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
39
  ADDRESS_BETTERTRANSFORMER = "http://3.95.136.2:8080/predictions/my_tc"
40
+ MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"