Julien Simon committed
Commit efaad9e
Parent: 7200b01

- Add Meraj and SuperNova

Files changed (1): results.py (+601, -550)
results.py CHANGED
@@ -1,559 +1,610 @@
  results = {
  "models": [
- {"name": "Arcee-Meraj",
- "modelType": "Qwen2 72B"
- },
- {
- "name": "Arcee-Nova",
- "modelType": "Qwen2 72B",
- "notes": "",
- "configurations": [
- {
- "region": "us-west-2",
- "instanceType": "g4dn.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA T4",
- "gpuRAM": "64 GB",
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "Flash Attention requires Ampere GPUs or newer"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA A10G",
- "gpuRAM": "96 GB",
  "configurations": [
- {
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12"
- },
- {
- "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12"
- },
- {
- "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "CUDA OOM"
- },
- {
- "quantization": "eetq (int8)",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "[FT Error] Heurisitc failed to find a valid config."
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A10G",
- "gpuRAM": "192 GB",
  "configurations": [
- {
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "CUDA OOM (but g6.48xlarge works!)"
- },
- {
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12.3"
- },
- {
- "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12.5"
- },
- {
- "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "The model deploys, but inference times out."
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA L4",
- "gpuRAM": "96 GB",
  "configurations": [
- {
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "1.5-2",
- "notes": "Too slow, timeouts are likely"
- },
- {
- "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "2",
- "notes": "Too slow, timeouts are likely"
- },
- {
- "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "CUDA OOM"
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA L4",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12"
- },
- {
- "region": "us-west-2",
- "instanceType": "p4d.24xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A100",
- "gpuRAM": "320 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "40",
- "notes": "\"MAX_INPUT_LENGTH\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
- },
- {
- "region": "us-west-2",
- "instanceType": "p4de.24xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A100",
- "gpuRAM": "320 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "waiting for quota"
- },
- {
- "region": "us-west-2",
- "instanceType": "p5.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA H100",
- "gpuRAM": "640GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "58",
- "notes": "\"MAX_INPUT_LENGTH\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
- },
- {
- "region": "us-west-2",
- "instanceType": "inf2.*",
- "cloud": "AWS",
- "gpu": "-",
- "tgi": "TGI 2.2.0",
- "status": "not supported",
- "tokensPerSecond": "-",
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO"
- }
- ]
- },
- {
- "name": "Llama-Spark",
- "modelType": "Llama 3.1 8B",
- "configurations": [
- {
- "region": "us-west-2",
- "instanceType": "g5.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA A10G",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "29",
- "notes": "4K/8K fails"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA A10G",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "85",
- "notes": "\"MAX_INPUT_TOKENS\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A10G",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "105",
- "notes": "\"MAX_INPUT_TOKENS\": \"20480\", \"MAX_TOTAL_TOKENS\": \"40960\"\n\n32K/64K fails"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA L4",
- "gpuRAM": "24 GB",
  "configurations": [
- {
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "15"
- },
- {
- "quantization": "fp8",
- "tgi": "TGI 2.2.0"
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA L4",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "51",
- "notes": "same as g5?"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA L4",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "81",
- "notes": "same as g5?"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6e.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA L40S",
- "gpuRAM": "48 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "42"
- },
- {
- "region": "us-west-2",
- "instanceType": "p4d.24xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA A100",
- "gpuRAM": "320 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "145",
- "notes": "\"MAX_INPUT_TOKENS\": \"40960\", \"MAX_TOTAL_TOKENS\": \"81920\"\n\n64K/128K fails (even with 4-bit)"
- },
- {
- "region": "us-west-2",
- "instanceType": "inf2.*",
- "cloud": "AWS",
- "gpu": "-",
- "status": "not supported",
- "tokensPerSecond": "-",
- "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO"
- }
- ]
- },
- {
- "name": "Arcee-Agent",
- "modelType": "Qwen2 7B",
- "notes": "",
- "configurations": [
- {
- "region": "us-west-2",
- "instanceType": "g5.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA A10G",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "30"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA A10G",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "83"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A10G",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8)\n\nSM_NUM_GPUS=7 doesn't work either because tensor sizes are not a multiple of 7 (e.g., 512)"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA L4",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "16.3"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA L4",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "54.2"
- },
- {
- "region": "us-west-2",
- "instanceType": "inf2.*",
- "cloud": "AWS",
- "gpu": "-",
- "tgi": "TGI 2.2.0",
- "status": "not supported",
- "tokensPerSecond": "-",
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO"
- }
- ]
- },
- {
- "name": "Arcee-Spark",
- "modelType": "Qwen2 7B"
- },
- {
- "name": "Arcee-Lite",
- "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
- "configurations": [
- {
- "region": "us-west-2",
- "instanceType": "c6i.xlarge",
- "cloud": "AWS",
- "gpu": "-",
- "gpuRAM": "-",
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "OOM, might work with a prequantized model"
- },
- {
- "region": "us-west-2",
- "instanceType": "c6i.2xlarge",
- "cloud": "AWS",
- "gpu": "-",
- "gpuRAM": "-",
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "OOM, might work with a prequantized model"
- },
- {
- "region": "us-west-2",
- "instanceType": "c6i.4xlarge",
- "cloud": "AWS",
- "gpu": "-",
- "gpuRAM": "-",
  "configurations": [
- {
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "10.7"
- },
- {
- "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "10.5"
- },
- {
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "10.6"
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "c7i.4xlarge",
- "cloud": "AWS",
- "gpu": "-",
- "gpuRAM": "-",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "waiting for quota",
- "tokensPerSecond": "-"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA A10G",
- "gpuRAM": "24 GB",
  "configurations": [
- {
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "110"
- },
- {
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": "105",
- "notes": "\"OPTION_MAX_MODEL_LEN\": \"32768\","
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g6e.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA L40S",
- "gpuRAM": "48 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "160"
- }
- ]
- },
- {
- "name": "Arcee-Scribe",
- "modelType": "InternLM2.5 8B",
- "configurations": [
- {
- "cloud": "us-west-2",
- "instanceType": "g5.2xlarge",
- "gpu": "1xNVIDIA A10G",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 29,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g5.12xlarge",
- "gpu": "4xNVIDIA A10G",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 65,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g5.48xlarge",
- "gpu": "8xNVIDIA A10G",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 80,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g6.2xlarge",
- "gpu": "1xNVIDIA L4",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 16,
- "notes": '"OPTION_MAX_MODEL_LEN": "4096"'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g6.12xlarge",
- "gpu": "4xNVIDIA L4",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 50,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g6.48xlarge",
- "gpu": "8xNVIDIA L4",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 69,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "p4d.24xlarge",
- "gpu": "4xNVIDIA A100",
- "gpuRAM": "320 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 82,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- }
- ]
- }
  ]
- }
  results = {
  "models": [
+ {
+ "name": "Arcee-Meraj",
+ "modelType": "Qwen2 72B",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "awq",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "33",
+ "notes": "",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "38",
+ "notes": "",
+ }
+ ],
+ },
+ {
+ "name": "Arcee-SuperNova",
+ "modelType": "Llama 3.1 70B",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "awq",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "33",
+ "notes": "",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "38",
+ "notes": "",
+ }
+ ],
+ },
+ {
+ "name": "Arcee-Nova",
+ "modelType": "Qwen2 72B",
+ "notes": "",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g4dn.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA T4",
+ "gpuRAM": "64 GB",
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "Flash Attention requires Ampere GPUs or newer",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "configurations": [
+ {
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12",
+ },
+ {
+ "quantization": "bitsandbytes-fp4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12",
+ },
+ {
+ "quantization": "bitsandbytes (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "CUDA OOM",
+ },
+ {
+ "quantization": "eetq (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "[FT Error] Heurisitc failed to find a valid config.",
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A10G",
+ "gpuRAM": "192 GB",
+ "configurations": [
+ {
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "CUDA OOM (but g6.48xlarge works!)",
+ },
+ {
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12.3",
+ },
+ {
+ "quantization": "bitsandbytes-fp4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12.5",
+ },
+ {
+ "quantization": "bitsandbytes (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "The model deploys, but inference times out.",
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA L4",
+ "gpuRAM": "96 GB",
+ "configurations": [
+ {
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "1.5-2",
+ "notes": "Too slow, timeouts are likely",
+ },
+ {
+ "quantization": "bitsandbytes-fp4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "2",
+ "notes": "Too slow, timeouts are likely",
+ },
+ {
+ "quantization": "bitsandbytes (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "CUDA OOM",
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA L4",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "40",
+ "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4de.24xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "waiting for quota",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p5.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA H100",
+ "gpuRAM": "640GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "58",
+ "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "inf2.*",
+ "cloud": "AWS",
+ "gpu": "-",
+ "tgi": "TGI 2.2.0",
+ "status": "not supported",
+ "tokensPerSecond": "-",
+ "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
+ },
+ ],
+ },
+ {
+ "name": "Llama-Spark",
+ "modelType": "Llama 3.1 8B",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA A10G",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "29",
+ "notes": "4K/8K fails",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "85",
+ "notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A10G",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "105",
+ "notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L4",
+ "gpuRAM": "24 GB",
+ "configurations": [
+ {
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "15",
+ },
+ {"quantization": "fp8", "tgi": "TGI 2.2.0"},
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA L4",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "51",
+ "notes": "same as g5?",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA L4",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "81",
+ "notes": "same as g5?",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "42",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "145",
+ "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "inf2.*",
+ "cloud": "AWS",
+ "gpu": "-",
+ "status": "not supported",
+ "tokensPerSecond": "-",
+ "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
+ },
+ ],
+ },
+ {
+ "name": "Arcee-Agent",
+ "modelType": "Qwen2 7B",
+ "notes": "",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA A10G",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "30",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "83",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A10G",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8)\n\nSM_NUM_GPUS=7 doesn't work either because tensor sizes are not a multiple of 7 (e.g., 512)",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L4",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "16.3",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA L4",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "54.2",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "inf2.*",
+ "cloud": "AWS",
+ "gpu": "-",
+ "tgi": "TGI 2.2.0",
+ "status": "not supported",
+ "tokensPerSecond": "-",
+ "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
+ },
+ ],
+ },
+ {"name": "Arcee-Spark", "modelType": "Qwen2 7B"},
+ {
+ "name": "Arcee-Lite",
+ "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "c6i.xlarge",
+ "cloud": "AWS",
+ "gpu": "-",
+ "gpuRAM": "-",
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "OOM, might work with a prequantized model",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "c6i.2xlarge",
+ "cloud": "AWS",
+ "gpu": "-",
+ "gpuRAM": "-",
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "OOM, might work with a prequantized model",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "c6i.4xlarge",
+ "cloud": "AWS",
+ "gpu": "-",
+ "gpuRAM": "-",
+ "configurations": [
+ {
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "10.7",
+ },
+ {
+ "quantization": "bitsandbytes (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "10.5",
+ },
+ {
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "10.6",
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "c7i.4xlarge",
+ "cloud": "AWS",
+ "gpu": "-",
+ "gpuRAM": "-",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "waiting for quota",
+ "tokensPerSecond": "-",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA A10G",
+ "gpuRAM": "24 GB",
+ "configurations": [
+ {
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "110",
+ },
+ {
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": "105",
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",',
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "160",
+ },
+ ],
+ },
+ {
+ "name": "Arcee-Scribe",
+ "modelType": "InternLM2.5 8B",
+ "configurations": [
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g5.2xlarge",
+ "gpu": "1xNVIDIA A10G",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 29,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 65,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g5.48xlarge",
+ "gpu": "8xNVIDIA A10G",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 80,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g6.2xlarge",
+ "gpu": "1xNVIDIA L4",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 16,
+ "notes": '"OPTION_MAX_MODEL_LEN": "4096"',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g6.12xlarge",
+ "gpu": "4xNVIDIA L4",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 50,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g6.48xlarge",
+ "gpu": "8xNVIDIA L4",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 69,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "gpu": "4xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 82,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ ],
+ },
  ]
+ }
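
Since results.py is a deeply nested Python dict, a short sketch may help readers query it. The helper below is illustrative only and not part of the commit; it assumes results.py is importable as a module, and the flatten_runs name is made up for this example. Note the schema quirk it handles: some instance entries embed a second "configurations" list (one element per quantization setting), while others describe a single run inline.

# Illustrative sketch, not part of the commit. Assumes results.py is on
# the Python path so its module-level dict can be imported.
from results import results

def flatten_runs(data):
    # One output row per (model, instance, quantization) benchmark run.
    rows = []
    for model in data["models"]:
        for inst in model.get("configurations", []):
            # Some instance entries nest a per-quantization
            # "configurations" list; flat entries describe one run.
            for run in inst.get("configurations", [inst]):
                rows.append({
                    "model": model["name"],
                    "instanceType": inst.get("instanceType"),
                    "quantization": run.get("quantization"),
                    "status": run.get("status"),
                    "tokensPerSecond": run.get("tokensPerSecond"),
                })
    return rows

# Example: list successful runs for the two models added in this commit.
new_models = {"Arcee-Meraj", "Arcee-SuperNova"}
for row in flatten_runs(results):
    if row["model"] in new_models and row["status"] == "OK":
        print(row["instanceType"], row["quantization"], row["tokensPerSecond"])

With this commit's data, the loop would print the g5.12xlarge (awq) and p4d.24xlarge (none) rows for each of the two new models.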