Julien Simon committed
Commit efaad9e
Parent: 7200b01

- Add Meraj and SuperNova

Files changed (1): results.py (+601, -550)
results.py CHANGED
@@ -1,559 +1,610 @@
  results = {
  "models": [
- {"name": "Arcee-Meraj",
- "modelType": "Qwen2 72B"
- },
- {
- "name": "Arcee-Nova",
- "modelType": "Qwen2 72B",
- "notes": "",
- "configurations": [
- {
- "region": "us-west-2",
- "instanceType": "g4dn.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA T4",
- "gpuRAM": "64 GB",
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "Flash Attention requires Ampere GPUs or newer"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA A10G",
- "gpuRAM": "96 GB",
  "configurations": [
- {
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12"
- },
- {
- "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12"
- },
- {
- "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "CUDA OOM"
- },
- {
- "quantization": "eetq (int8)",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "[FT Error] Heurisitc failed to find a valid config."
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A10G",
- "gpuRAM": "192 GB",
  "configurations": [
- {
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "CUDA OOM (but g6.48xlarge works!)"
- },
- {
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12.3"
- },
- {
- "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12.5"
- },
- {
- "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "The model deploys, but inference times out."
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA L4",
- "gpuRAM": "96 GB",
  "configurations": [
- {
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "1.5-2",
- "notes": "Too slow, timeouts are likely"
- },
- {
- "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "2",
- "notes": "Too slow, timeouts are likely"
- },
- {
- "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "CUDA OOM"
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA L4",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "12"
- },
- {
- "region": "us-west-2",
- "instanceType": "p4d.24xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A100",
- "gpuRAM": "320 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "40",
- "notes": "\"MAX_INPUT_LENGTH\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
- },
- {
- "region": "us-west-2",
- "instanceType": "p4de.24xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A100",
- "gpuRAM": "320 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "waiting for quota"
- },
- {
- "region": "us-west-2",
- "instanceType": "p5.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA H100",
- "gpuRAM": "640GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "58",
- "notes": "\"MAX_INPUT_LENGTH\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
- },
- {
- "region": "us-west-2",
- "instanceType": "inf2.*",
- "cloud": "AWS",
- "gpu": "-",
- "tgi": "TGI 2.2.0",
- "status": "not supported",
- "tokensPerSecond": "-",
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO"
- }
- ]
- },
- {
- "name": "Llama-Spark",
- "modelType": "Llama 3.1 8B",
- "configurations": [
- {
- "region": "us-west-2",
- "instanceType": "g5.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA A10G",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "29",
- "notes": "4K/8K fails"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA A10G",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "85",
- "notes": "\"MAX_INPUT_TOKENS\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A10G",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "105",
- "notes": "\"MAX_INPUT_TOKENS\": \"20480\", \"MAX_TOTAL_TOKENS\": \"40960\"\n\n32K/64K fails"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA L4",
- "gpuRAM": "24 GB",
  "configurations": [
- {
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "15"
- },
- {
- "quantization": "fp8",
- "tgi": "TGI 2.2.0"
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA L4",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "51",
- "notes": "same as g5?"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA L4",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "81",
- "notes": "same as g5?"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6e.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA L40S",
- "gpuRAM": "48 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "42"
- },
- {
- "region": "us-west-2",
- "instanceType": "p4d.24xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA A100",
- "gpuRAM": "320 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "145",
- "notes": "\"MAX_INPUT_TOKENS\": \"40960\", \"MAX_TOTAL_TOKENS\": \"81920\"\n\n64K/128K fails (even with 4-bit)"
- },
- {
- "region": "us-west-2",
- "instanceType": "inf2.*",
- "cloud": "AWS",
- "gpu": "-",
- "status": "not supported",
- "tokensPerSecond": "-",
- "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO"
- }
- ]
- },
- {
- "name": "Arcee-Agent",
- "modelType": "Qwen2 7B",
- "notes": "",
- "configurations": [
- {
- "region": "us-west-2",
- "instanceType": "g5.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA A10G",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "30"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA A10G",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "83"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.48xlarge",
- "cloud": "AWS",
- "gpu": "8xNVIDIA A10G",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8)\n\nSM_NUM_GPUS=7 doesn't work either because tensor sizes are not a multiple of 7 (e.g., 512)"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA L4",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "16.3"
- },
- {
- "region": "us-west-2",
- "instanceType": "g6.12xlarge",
- "cloud": "AWS",
- "gpu": "4xNVIDIA L4",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "54.2"
- },
- {
- "region": "us-west-2",
- "instanceType": "inf2.*",
- "cloud": "AWS",
- "gpu": "-",
- "tgi": "TGI 2.2.0",
- "status": "not supported",
- "tokensPerSecond": "-",
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO"
- }
- ]
- },
- {
- "name": "Arcee-Spark",
- "modelType": "Qwen2 7B"
- },
- {
- "name": "Arcee-Lite",
- "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
- "configurations": [
- {
- "region": "us-west-2",
- "instanceType": "c6i.xlarge",
- "cloud": "AWS",
- "gpu": "-",
- "gpuRAM": "-",
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "OOM, might work with a prequantized model"
- },
- {
- "region": "us-west-2",
- "instanceType": "c6i.2xlarge",
- "cloud": "AWS",
- "gpu": "-",
- "gpuRAM": "-",
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "KO",
- "tokensPerSecond": "-",
- "notes": "OOM, might work with a prequantized model"
- },
- {
- "region": "us-west-2",
- "instanceType": "c6i.4xlarge",
- "cloud": "AWS",
- "gpu": "-",
- "gpuRAM": "-",
  "configurations": [
- {
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "10.7"
- },
- {
- "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "10.5"
- },
- {
- "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "10.6"
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "c7i.4xlarge",
- "cloud": "AWS",
- "gpu": "-",
- "gpuRAM": "-",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "waiting for quota",
- "tokensPerSecond": "-"
- },
- {
- "region": "us-west-2",
- "instanceType": "g5.xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA A10G",
- "gpuRAM": "24 GB",
  "configurations": [
- {
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "110"
- },
- {
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": "105",
- "notes": "\"OPTION_MAX_MODEL_LEN\": \"32768\","
- }
- ]
- },
- {
- "region": "us-west-2",
- "instanceType": "g6e.2xlarge",
- "cloud": "AWS",
- "gpu": "1xNVIDIA L40S",
- "gpuRAM": "48 GB",
- "quantization": "none",
- "tgi": "TGI 2.2.0",
- "status": "OK",
- "tokensPerSecond": "160"
- }
- ]
- },
- {
- "name": "Arcee-Scribe",
- "modelType": "InternLM2.5 8B",
- "configurations": [
- {
- "cloud": "us-west-2",
- "instanceType": "g5.2xlarge",
- "gpu": "1xNVIDIA A10G",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 29,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g5.12xlarge",
- "gpu": "4xNVIDIA A10G",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 65,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g5.48xlarge",
- "gpu": "8xNVIDIA A10G",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 80,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g6.2xlarge",
- "gpu": "1xNVIDIA L4",
- "gpuRAM": "24 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 16,
- "notes": '"OPTION_MAX_MODEL_LEN": "4096"'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g6.12xlarge",
- "gpu": "4xNVIDIA L4",
- "gpuRAM": "96 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 50,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "g6.48xlarge",
- "gpu": "8xNVIDIA L4",
- "gpuRAM": "192 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 69,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- },
- {
- "cloud": "us-west-2",
- "instanceType": "p4d.24xlarge",
- "gpu": "4xNVIDIA A100",
- "gpuRAM": "320 GB",
- "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
- "status": "OK",
- "tokensPerSecond": 82,
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
- }
- ]
- }
  ]
- }
  results = {
  "models": [
+ {
+ "name": "Arcee-Meraj",
+ "modelType": "Qwen2 72B",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "awq",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "33",
+ "notes": "",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "38",
+ "notes": "",
+ }
+ ],
+ },
+ {
+ "name": "Arcee-SuperNova",
+ "modelType": "Llama 3.1 70B",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "awq",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "33",
+ "notes": "",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "38",
+ "notes": "",
+ }
+ ],
+ },
+ {
+ "name": "Arcee-Nova",
+ "modelType": "Qwen2 72B",
+ "notes": "",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g4dn.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA T4",
+ "gpuRAM": "64 GB",
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "Flash Attention requires Ampere GPUs or newer",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "configurations": [
+ {
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12",
+ },
+ {
+ "quantization": "bitsandbytes-fp4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12",
+ },
+ {
+ "quantization": "bitsandbytes (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "CUDA OOM",
+ },
+ {
+ "quantization": "eetq (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "[FT Error] Heurisitc failed to find a valid config.",
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A10G",
+ "gpuRAM": "192 GB",
+ "configurations": [
+ {
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "CUDA OOM (but g6.48xlarge works!)",
+ },
+ {
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12.3",
+ },
+ {
+ "quantization": "bitsandbytes-fp4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12.5",
+ },
+ {
+ "quantization": "bitsandbytes (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "The model deploys, but inference times out.",
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA L4",
+ "gpuRAM": "96 GB",
+ "configurations": [
+ {
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "1.5-2",
+ "notes": "Too slow, timeouts are likely",
+ },
+ {
+ "quantization": "bitsandbytes-fp4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "2",
+ "notes": "Too slow, timeouts are likely",
+ },
+ {
+ "quantization": "bitsandbytes (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "CUDA OOM",
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA L4",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "12",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "40",
+ "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4de.24xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "waiting for quota",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p5.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA H100",
+ "gpuRAM": "640GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "58",
+ "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "inf2.*",
+ "cloud": "AWS",
+ "gpu": "-",
+ "tgi": "TGI 2.2.0",
+ "status": "not supported",
+ "tokensPerSecond": "-",
+ "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
+ },
+ ],
+ },
+ {
+ "name": "Llama-Spark",
+ "modelType": "Llama 3.1 8B",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA A10G",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "29",
+ "notes": "4K/8K fails",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "85",
+ "notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A10G",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "105",
+ "notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L4",
+ "gpuRAM": "24 GB",
+ "configurations": [
+ {
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "15",
+ },
+ {"quantization": "fp8", "tgi": "TGI 2.2.0"},
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA L4",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "51",
+ "notes": "same as g5?",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA L4",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "81",
+ "notes": "same as g5?",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "42",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "145",
+ "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "inf2.*",
+ "cloud": "AWS",
+ "gpu": "-",
+ "status": "not supported",
+ "tokensPerSecond": "-",
+ "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
+ },
+ ],
+ },
+ {
+ "name": "Arcee-Agent",
+ "modelType": "Qwen2 7B",
+ "notes": "",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA A10G",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "30",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "83",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.48xlarge",
+ "cloud": "AWS",
+ "gpu": "8xNVIDIA A10G",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8)\n\nSM_NUM_GPUS=7 doesn't work either because tensor sizes are not a multiple of 7 (e.g., 512)",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L4",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "16.3",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6.12xlarge",
+ "cloud": "AWS",
+ "gpu": "4xNVIDIA L4",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "54.2",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "inf2.*",
+ "cloud": "AWS",
+ "gpu": "-",
+ "tgi": "TGI 2.2.0",
+ "status": "not supported",
+ "tokensPerSecond": "-",
+ "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
+ },
+ ],
+ },
+ {"name": "Arcee-Spark", "modelType": "Qwen2 7B"},
+ {
+ "name": "Arcee-Lite",
+ "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
  "configurations": [
+ {
+ "region": "us-west-2",
+ "instanceType": "c6i.xlarge",
+ "cloud": "AWS",
+ "gpu": "-",
+ "gpuRAM": "-",
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "OOM, might work with a prequantized model",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "c6i.2xlarge",
+ "cloud": "AWS",
+ "gpu": "-",
+ "gpuRAM": "-",
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "KO",
+ "tokensPerSecond": "-",
+ "notes": "OOM, might work with a prequantized model",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "c6i.4xlarge",
+ "cloud": "AWS",
+ "gpu": "-",
+ "gpuRAM": "-",
+ "configurations": [
+ {
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "10.7",
+ },
+ {
+ "quantization": "bitsandbytes (int8)",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "10.5",
+ },
+ {
+ "quantization": "bitsandbytes-nf4",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "10.6",
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "c7i.4xlarge",
+ "cloud": "AWS",
+ "gpu": "-",
+ "gpuRAM": "-",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "waiting for quota",
+ "tokensPerSecond": "-",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g5.xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA A10G",
+ "gpuRAM": "24 GB",
+ "configurations": [
+ {
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "110",
+ },
+ {
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": "105",
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",',
+ },
+ ],
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "tgi": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "160",
+ },
+ ],
+ },
+ {
+ "name": "Arcee-Scribe",
+ "modelType": "InternLM2.5 8B",
+ "configurations": [
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g5.2xlarge",
+ "gpu": "1xNVIDIA A10G",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 29,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g5.12xlarge",
+ "gpu": "4xNVIDIA A10G",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 65,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g5.48xlarge",
+ "gpu": "8xNVIDIA A10G",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 80,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g6.2xlarge",
+ "gpu": "1xNVIDIA L4",
+ "gpuRAM": "24 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 16,
+ "notes": '"OPTION_MAX_MODEL_LEN": "4096"',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g6.12xlarge",
+ "gpu": "4xNVIDIA L4",
+ "gpuRAM": "96 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 50,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "g6.48xlarge",
+ "gpu": "8xNVIDIA L4",
+ "gpuRAM": "192 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 69,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ {
+ "cloud": "us-west-2",
+ "instanceType": "p4d.24xlarge",
+ "gpu": "4xNVIDIA A100",
+ "gpuRAM": "320 GB",
+ "quantization": "none",
+ "tgi": "DJL 0.28 vLLM",
+ "status": "OK",
+ "tokensPerSecond": 82,
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
+ },
+ ],
+ },
  ]
+ }
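
Since results.py is a deeply nested Python dict, a short sketch may help readers query it. The helper below is illustrative only and not part of the commit; it assumes results.py is importable as a module, and the flatten_runs name is made up for this example. Note the schema quirk it handles: some instance entries embed a second "configurations" list (one element per quantization setting), while others describe a single run inline.

# Illustrative sketch, not part of the commit. Assumes results.py is on
# the Python path so its module-level dict can be imported.
from results import results

def flatten_runs(data):
    # One output row per (model, instance, quantization) benchmark run.
    rows = []
    for model in data["models"]:
        for inst in model.get("configurations", []):
            # Some instance entries nest a per-quantization
            # "configurations" list; flat entries describe one run.
            for run in inst.get("configurations", [inst]):
                rows.append({
                    "model": model["name"],
                    "instanceType": inst.get("instanceType"),
                    "quantization": run.get("quantization"),
                    "status": run.get("status"),
                    "tokensPerSecond": run.get("tokensPerSecond"),
                })
    return rows

# Example: list successful runs for the two models added in this commit.
new_models = {"Arcee-Meraj", "Arcee-SuperNova"}
for row in flatten_runs(results):
    if row["model"] in new_models and row["status"] == "OK":
        print(row["instanceType"], row["quantization"], row["tokensPerSecond"])

With this commit's data, the loop would print the g5.12xlarge (awq) and p4d.24xlarge (none) rows for each of the two new models.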