Jae-Won Chung committed on
Commit
395a4be
1 Parent(s): 315ec00

Record a bit more metrics

Browse files
tests/colosseum/controller_load_test.py CHANGED
@@ -1,7 +1,10 @@
1
  import os
 
 
2
  import time
3
  import random
4
  import itertools
 
5
  import multiprocessing as mp
6
 
7
  import tyro
@@ -70,42 +73,89 @@ PROMPTS = [
70
  ] * 2
71
 
72
 
73
- def request(prompt: str) -> tuple[str, str, str, str, float]:
74
  time.sleep(random.random() * 5)
75
- client = ControllerClient(CONTROLLER_ADDR, timeout=30)
76
 
77
  response_a, response_b = "", ""
 
 
78
  start_time = time.monotonic()
79
- for resp_a, resp_b in itertools.zip_longest(
80
  client.prompt(prompt, index=0),
81
  client.prompt(prompt, index=1),
82
- ):
 
 
83
  if resp_a is not None:
 
84
  response_a += resp_a
85
  if resp_b is not None:
 
86
  response_b += resp_b
87
 
88
  latency = time.monotonic() - start_time
89
- return client.request_id, prompt, response_a, response_b, latency
 
90
 
91
 
92
- def main(concurrency: int = len(PROMPTS), logfile: str = "load_test_results.csv"):
93
- latencies = []
 
 
 
 
 
94
 
95
- start_time = time.monotonic()
96
- with mp.Pool(processes=concurrency) as pool:
97
- for request_id, prompt, response_a, response_b, latency in pool.imap_unordered(request, PROMPTS):
98
- latencies.append(latency)
99
- print(f"Request ID {request_id} finished, {latency=:.2f}s")
100
-
101
- total_time = time.monotonic() - start_time
102
- average_latency = sum(latencies) / len(latencies)
103
- requests_per_second = len(latencies) / total_time
104
- print(f"Total time: {total_time:.2f}s")
105
- print(f"Average latency: {average_latency:.2f}s")
106
- print(f"Requests per second: {requests_per_second:.2f}")
107
- with open(logfile, "a") as f:
108
- f.write(f"{concurrency},{total_time},{average_latency},{requests_per_second}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
 
111
  if __name__ == "__main__":
 
1
  import os
2
+ import csv
3
+ import json
4
  import time
5
  import random
6
  import itertools
7
+ from statistics import quantiles
8
  import multiprocessing as mp
9
 
10
  import tyro
 
73
  ] * 2
74
 
75
 
76
def request(prompt: str) -> tuple[str, float, float, float]:
    """Send one prompt to the controller and time its two streamed responses.

    The two response streams (``index=0`` and ``index=1``) are consumed in
    lockstep until both are exhausted.

    Args:
        prompt: Prompt text forwarded to ``ControllerClient.prompt``.

    Returns:
        A tuple ``(request_id, latency, first_token_latency,
        tokens_per_second)``. Times are in seconds, measured with
        ``time.monotonic`` from just before streaming starts.
        ``first_token_latency`` is ``-1.0`` when neither stream yields
        anything. ``tokens_per_second`` is ``0.0`` when the measured latency
        is zero.
    """
    # Random 0-5 s delay before sending; presumably staggers the worker
    # processes so they do not all hit the controller at the same instant.
    time.sleep(random.random() * 5)
    client = ControllerClient(CONTROLLER_ADDR, timeout=60)

    first_token_latency = -1.0
    num_tokens = 0
    start_time = time.monotonic()
    streams = itertools.zip_longest(
        client.prompt(prompt, index=0),
        client.prompt(prompt, index=1),
    )
    for i, chunks in enumerate(streams):
        if i == 0:
            # zip_longest pulls from both streams before yielding its first
            # pair, so this is the time until both have produced (or ended).
            first_token_latency = time.monotonic() - start_time
        # zip_longest pads the shorter stream with None; only real chunks
        # count. Each chunk is counted as one token (assumes the client
        # streams one token per chunk -- TODO confirm).
        num_tokens += sum(chunk is not None for chunk in chunks)

    latency = time.monotonic() - start_time
    # Guard against a zero interval on clocks with coarse resolution.
    tokens_per_second = num_tokens / latency if latency > 0 else 0.0
    return client.request_id, latency, first_token_latency, tokens_per_second
100
 
101
 
102
def main(
    concurrencies: list[int] = [10],
    result_csv: str = "load_test_results.csv",
    ftl_json: str = "ftl_dist.json",
):
    """Run the load test once per concurrency level and save the metrics.

    For each concurrency level, every prompt in ``PROMPTS`` is sent through
    ``request`` using a process pool of that size. Per-request results are
    printed as they finish, followed by a summary for the level.

    Args:
        concurrencies: Pool sizes to test, one run per entry. The default
            list is only read, never mutated.
        result_csv: Path of the CSV summary file (overwritten), one row per
            concurrency level.
        ftl_json: Path of the JSON file (overwritten) mapping each
            concurrency level to its list of first-token latencies.
    """
    data = []
    ftl_dist = {}

    for concurrency in concurrencies:
        latencies = []
        first_token_latencies = []
        tps = []

        start_time = time.monotonic()
        with mp.Pool(processes=concurrency) as pool:
            # imap_unordered yields results in completion order, so each
            # request is reported as soon as it finishes.
            for request_id, latency, first_token_latency, tokens_per_second in pool.imap_unordered(request, PROMPTS):
                latencies.append(latency)
                first_token_latencies.append(first_token_latency)
                tps.append(tokens_per_second)
                print(
                    f"Request ID {request_id} finished, {latency=:.2f}s, "
                    f"{first_token_latency=:.2f}s, {tokens_per_second=:.2f} tokens/s"
                )

        total_time = time.monotonic() - start_time
        average_latency = sum(latencies) / len(latencies)
        average_first_token_latency = sum(first_token_latencies) / len(first_token_latencies)
        # n=10 yields the nine decile cut points (not quartiles).
        first_token_latency_deciles = quantiles(first_token_latencies, n=10)
        ftl_dist[concurrency] = first_token_latencies
        average_tokens_per_second = sum(tps) / len(tps)
        requests_per_second = len(latencies) / total_time
        print(f"Total time: {total_time:.2f}s")
        print(f"Average latency: {average_latency:.2f}s")
        print(f"Average first token latency: {average_first_token_latency:.2f}s")
        print(f"Average tokens per second: {average_tokens_per_second:.2f}")
        print(f"Requests per second: {requests_per_second:.2f}")
        print(f"First token latency deciles: {first_token_latency_deciles}")
        data.append((
            concurrency,
            total_time,
            average_latency,
            average_first_token_latency,
            average_tokens_per_second,
            requests_per_second,
        ))

    # newline="" lets the csv writer control line endings; without it,
    # platforms that translate "\n" end up with blank rows between records.
    with open(result_csv, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow((
            "concurrency",
            "total_time",
            "average_latency",
            "average_first_token_latency",
            "average_tokens_per_second",
            "requests_per_second",
        ))
        writer.writerows(data)

    # json.dump converts the integer concurrency keys to strings.
    with open(ftl_json, "w") as f:
        json.dump(ftl_dist, f)
159
 
160
 
161
  if __name__ == "__main__":