Felix Marty committed on
Commit
f5a63b8
1 Parent(s): bf38ec8

fix throughput measure

Browse files
Files changed (3) hide show
  1. app.py +1 -15
  2. backend.py +14 -10
  3. utils.py +0 -26
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
 
3
  import json
4
- import math
5
  from backend import get_message_single, get_message_spam, send_single, send_spam, tokenizer
6
  from defaults import (
7
  ADDRESS_BETTERTRANSFORMER,
@@ -10,7 +9,6 @@ from defaults import (
10
  defaults_bt_spam,
11
  defaults_vanilla_single,
12
  defaults_vanilla_spam,
13
- BATCH_SIZE,
14
  )
15
 
16
  import datasets
@@ -22,17 +20,6 @@ def dispatch_single(input_model_single, address_input_vanilla, address_input_bet
22
 
23
  return result_vanilla, result_bettertransformer
24
 
25
- def dispatch_spam(input_n_spam, address_input_vanilla, address_input_bettertransformer):
26
- input_n_spam = int(input_n_spam)
27
- assert input_n_spam <= len(data)
28
-
29
- inp = data.shuffle().select(range(input_n_spam))
30
-
31
- result_vanilla = send_spam(inp, address_input_vanilla)
32
- result_bettertransformer = send_spam(inp, address_input_bettertransformer)
33
-
34
- return result_vanilla, result_bettertransformer
35
-
36
  def dispatch_spam_artif(input_n_spam_artif, sequence_length, padding_ratio, address_input_vanilla, address_input_bettertransformer):
37
  sequence_length = int(sequence_length)
38
  input_n_spam_artif = int(input_n_spam_artif)
@@ -44,7 +31,6 @@ def dispatch_spam_artif(input_n_spam_artif, sequence_length, padding_ratio, addr
44
 
45
  inp_tokens[0] = 101
46
  inp_tokens[- n_pads - 1] = 102
47
- #inp_tokens = inp_tokens.unsqueeze(0).repeat(BATCH_SIZE, 1)
48
 
49
  attention_mask = torch.zeros((sequence_length,), dtype=torch.int64)
50
  attention_mask[:- n_pads] = 1
@@ -63,7 +49,7 @@ def dispatch_spam_artif(input_n_spam_artif, sequence_length, padding_ratio, addr
63
  result_bettertransformer = send_spam(input_dataset, address_input_bettertransformer)
64
 
65
  return result_vanilla, result_bettertransformer
66
-
67
  TTILE_IMAGE = """
68
  <div
69
  style="
 
1
  import gradio as gr
2
 
3
  import json
 
4
  from backend import get_message_single, get_message_spam, send_single, send_spam, tokenizer
5
  from defaults import (
6
  ADDRESS_BETTERTRANSFORMER,
 
9
  defaults_bt_spam,
10
  defaults_vanilla_single,
11
  defaults_vanilla_spam,
 
12
  )
13
 
14
  import datasets
 
20
 
21
  return result_vanilla, result_bettertransformer
22
 
 
 
 
 
 
 
 
 
 
 
 
23
  def dispatch_spam_artif(input_n_spam_artif, sequence_length, padding_ratio, address_input_vanilla, address_input_bettertransformer):
24
  sequence_length = int(sequence_length)
25
  input_n_spam_artif = int(input_n_spam_artif)
 
31
 
32
  inp_tokens[0] = 101
33
  inp_tokens[- n_pads - 1] = 102
 
34
 
35
  attention_mask = torch.zeros((sequence_length,), dtype=torch.int64)
36
  attention_mask[:- n_pads] = 1
 
49
  result_bettertransformer = send_spam(input_dataset, address_input_bettertransformer)
50
 
51
  return result_vanilla, result_bettertransformer
52
+
53
  TTILE_IMAGE = """
54
  <div
55
  style="
backend.py CHANGED
@@ -6,11 +6,11 @@ from defaults import (
6
  HEADERS,
7
  MODEL_NAME,
8
  )
9
- from utils import ElapsedFuturesSession
10
 
11
  from transformers import AutoTokenizer
12
 
13
- import numpy as np
14
 
15
  RETURN_MESSAGE_SINGLE = """
16
  Inference statistics:
@@ -68,18 +68,20 @@ def get_message_spam(
68
  )
69
 
70
 
71
- SESSION = ElapsedFuturesSession()
72
 
73
  def send_single(input_model_vanilla, address: str):
74
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
75
 
76
  # should not take more than 10 s, so timeout if that's the case
 
77
  promise = SESSION.post(
78
  address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"), timeout=10
79
  )
80
 
81
  try:
82
  response = promise.result() # resolve ASAP
 
83
  except Exception as e:
84
  return f"{e}"
85
 
@@ -89,7 +91,7 @@ def send_single(input_model_vanilla, address: str):
89
  prediction = response_text[0]
90
  inf_latency = response_text[1]
91
  peak_gpu_memory = response_text[2]
92
- end_to_end_latency = response.elapsed
93
 
94
  return get_message_single(
95
  status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
@@ -99,8 +101,6 @@ def send_single(input_model_vanilla, address: str):
99
  def send_spam(inp, address: str):
100
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
101
 
102
- # data = "this is positive lol" #TODO: use dynamic data with padding
103
- max_resolution_time = 0
104
  mean_inference_latency = 0
105
  mean_peak_gpu_memory = 0
106
 
@@ -113,6 +113,7 @@ def send_spam(inp, address: str):
113
 
114
  n_inputs = len(inp)
115
 
 
116
  for i in range(n_inputs):
117
  input_data = inp[i]["sentence"].encode("utf-8")
118
 
@@ -121,18 +122,21 @@ def send_spam(inp, address: str):
121
  SESSION.post(address, headers=HEADERS, data=input_data, timeout=15)
122
  )
123
 
 
 
124
  for promise in promises:
125
  try:
126
  response = promise.result() # resolve ASAP
127
  except Exception as e:
128
  return f"{e}"
 
 
129
 
 
 
130
  response = promise.result()
131
-
132
  response_text = json.loads(response.text)
133
 
134
- max_resolution_time = max(max_resolution_time, response.elapsed)
135
-
136
  mean_inference_latency += response_text[1]
137
  mean_peak_gpu_memory += response_text[2]
138
  n_pads += response_text[3]
@@ -140,7 +144,7 @@ def send_spam(inp, address: str):
140
  sequence_length += response_text[5]
141
  effective_batch_size += response_text[6]
142
 
143
- throughput = n_inputs / (max_resolution_time * 1e-3)
144
  mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
145
  mean_sequence_length = sequence_length / n_inputs
146
  effective_batch_size = effective_batch_size / n_inputs
 
6
  HEADERS,
7
  MODEL_NAME,
8
  )
9
+ from requests_futures.sessions import FuturesSession
10
 
11
  from transformers import AutoTokenizer
12
 
13
+ import time
14
 
15
  RETURN_MESSAGE_SINGLE = """
16
  Inference statistics:
 
68
  )
69
 
70
 
71
+ SESSION = FuturesSession()
72
 
73
  def send_single(input_model_vanilla, address: str):
74
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
75
 
76
  # should not take more than 10 s, so timeout if that's the case
77
+ start = time.time()
78
  promise = SESSION.post(
79
  address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"), timeout=10
80
  )
81
 
82
  try:
83
  response = promise.result() # resolve ASAP
84
+ end = time.time()
85
  except Exception as e:
86
  return f"{e}"
87
 
 
91
  prediction = response_text[0]
92
  inf_latency = response_text[1]
93
  peak_gpu_memory = response_text[2]
94
+ end_to_end_latency = round((end - start) * 1e3, 2)
95
 
96
  return get_message_single(
97
  status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
 
101
  def send_spam(inp, address: str):
102
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
103
 
 
 
104
  mean_inference_latency = 0
105
  mean_peak_gpu_memory = 0
106
 
 
113
 
114
  n_inputs = len(inp)
115
 
116
+ start = time.time()
117
  for i in range(n_inputs):
118
  input_data = inp[i]["sentence"].encode("utf-8")
119
 
 
122
  SESSION.post(address, headers=HEADERS, data=input_data, timeout=15)
123
  )
124
 
125
+ # to measure throughput first
126
+ end = 0
127
  for promise in promises:
128
  try:
129
  response = promise.result() # resolve ASAP
130
  except Exception as e:
131
  return f"{e}"
132
+
133
+ end = max(time.time(), end)
134
 
135
+ # then other metrics
136
+ for promise in promises:
137
  response = promise.result()
 
138
  response_text = json.loads(response.text)
139
 
 
 
140
  mean_inference_latency += response_text[1]
141
  mean_peak_gpu_memory += response_text[2]
142
  n_pads += response_text[3]
 
144
  sequence_length += response_text[5]
145
  effective_batch_size += response_text[6]
146
 
147
+ throughput = n_inputs / (end - start)
148
  mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
149
  mean_sequence_length = sequence_length / n_inputs
150
  effective_batch_size = effective_batch_size / n_inputs
utils.py CHANGED
@@ -1,26 +0,0 @@
1
- import time
2
-
3
- from requests_futures.sessions import FuturesSession
4
-
5
-
6
- class ElapsedFuturesSession(FuturesSession):
7
- def request(self, method, url, hooks=None, *args, **kwargs):
8
- start = time.time()
9
- if hooks is None:
10
- hooks = {}
11
-
12
- def timing(r, *args, **kwargs):
13
- r.elapsed = round((time.time() - start) * 1000, 2)
14
-
15
- try:
16
- if isinstance(hooks["response"], (list, tuple)):
17
- # needs to be first so we don't time other hooks execution
18
- hooks["response"].insert(0, timing)
19
- else:
20
- hooks["response"] = [timing, hooks["response"]]
21
- except KeyError:
22
- hooks["response"] = timing
23
-
24
- return super(ElapsedFuturesSession, self).request(
25
- method, url, hooks=hooks, *args, **kwargs
26
- )