Spaces:

arcee-ai
/

Benchmarks

Running

File size: 7,457 Bytes

"""Module containing performance results for the Arcee-SuperNova model."""

# Container identifiers that appear in more than one run below.
_LLAMA_CPP_0919 = "llama.cpp 9/19/24"
_TGI_220 = "TGI 2.2.0"
_LMI_NEURONX = "LMI 0.29+transformers-neuronx 0.11.351"
_LMI_VLLM = "LMI 0.29+vLLM 0.5.5"
_VLLM_064 = "vLLM 0.6.4.post1"

# Note string shared by several TGI runs.
_TGI_16K_32K = "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768"


def _run(
    quantization: str,
    container: str,
    status: str,
    tokens_per_second: str,
    notes: str = "",
) -> dict:
    """Return one run record holding the five per-run fields.

    Keys are emitted in the order: quantization, container, status,
    tokensPerSecond, notes. All values are stored as given (strings).
    """
    return {
        "quantization": quantization,
        "container": container,
        "status": status,
        "tokensPerSecond": tokens_per_second,
        "notes": notes,
    }


def _single(
    instance_type: str,
    quantization: str,
    container: str,
    status: str,
    tokens_per_second: str,
    notes: str = "",
) -> dict:
    """Return a flat entry: "instanceType" followed by one run's fields."""
    return {
        "instanceType": instance_type,
        **_run(quantization, container, status, tokens_per_second, notes),
    }


def _grouped(instance_type: str, *runs: dict) -> dict:
    """Return a nested entry listing several runs under "configurations"."""
    return {"instanceType": instance_type, "configurations": list(runs)}


# Results table for Arcee-SuperNova. Each item of "configurations" is either
# a flat entry (instance type plus a single run) or a nested entry (instance
# type plus its own "configurations" list of runs). Every value is a string,
# including throughput; "-" and "???" are kept exactly as recorded.
results_arcee_supernova = {
    "name": "Arcee-SuperNova",
    "modelType": "Llama 3.1 70B",
    "configurations": [
        _single("c7g.16xlarge", "Q4_0_8_8", _LLAMA_CPP_0919, "OK", "6.5"),
        _single(
            "r8g.16xlarge",
            "Q4_0_4_8",
            _LLAMA_CPP_0919,
            "OK",
            "9.8",
            "With Flash Attention",
        ),
        _single(
            "g5.12xlarge",
            "awq",
            _TGI_220,
            "OK",
            "33",
            "MAX_INPUT_TOKENS: 8192, MAX_TOTAL_TOKENS: 16384",
        ),
        _single(
            "g6e.2xlarge",
            "awq (w4 g128)",
            "vLLM 0.6.2",
            "OK",
            "18",
            "--max-model-len 10000 --max-num-seqs 16 --enforce-eager",
        ),
        _single(
            "g6e.2xlarge",
            "Q4_K_M",
            "llama.cpp 10/2/24",
            "OK",
            "16",
            "-ngl 81 -c 13000 -fa -t 8",
        ),
        _single(
            "g6e.12xlarge",
            "none",
            "vLLM 0.6.3",
            "OK",
            "18.6",
            "--max-model-len 16384",
        ),
        _single("p4d.24xlarge", "awq", _TGI_220, "OK", "58", _TGI_16K_32K),
        _single("p5.48xlarge", "awq", _TGI_220, "OK", "73", _TGI_16K_32K),
        _grouped(
            "inf2.24xlarge",
            _run("none", _LMI_NEURONX, "KO", "-", "OOM bs=2,seqlen=4096 - SDK 2.19.1"),
            _run("none", _LMI_NEURONX, "KO", "-", "OOM bs=2,seqlen=2048 - SDK 2.19.1"),
            _run(
                "8-bit",
                _LMI_NEURONX,
                "???",
                "???",
                "bs=2,seqlen=8192 - SDK 2.19.1 - OPTION_LOAD_IN_8BIT=True",
            ),
        ),
        _grouped(
            "inf2.48xlarge",
            _run("none", _LMI_NEURONX, "OK", "28", "bs=4,seqlen=4096 - SDK 2.19.1"),
            _run("none", _LMI_NEURONX, "OK", "24", "bs=2,seqlen=8192 - SDK 2.19.1"),
            _run("none", _LMI_NEURONX, "KO", "-", "OOM bs=2,seqlen=16384 - SDK 2.19.1"),
        ),
        _grouped(
            "trn1.32xlarge",
            _run("none", _LMI_NEURONX, "OK", "32", "bs=2,seqlen=8192 - SDK 2.19.1"),
            _run("none", "LMI 0.30rc1", "OK", "34", "bs=2,seqlen=8192 - SDK 2.20"),
        ),
        _grouped(
            "p4d.24xlarge",
            _run("none", _TGI_220, "OK", "30"),
            _run("none", _LMI_VLLM, "OK", "45", "OPTION_MAX_MODEL_LEN 64k"),
        ),
        _grouped(
            "p5.48xlarge",
            _run("none", _TGI_220, "OK", "58", _TGI_16K_32K),
            _run("none", _LMI_VLLM, "OK", "70", "OPTION_MAX_MODEL_LEN 128k"),
            _run("none", _LMI_VLLM, "OK", "70", "OPTION_ENFORCE_EAGER=True"),
        ),
        _single(
            "p5.48xlarge",
            "none",
            _VLLM_064,
            "OK",
            "77",
            "--tensor-parallel-size 8",
        ),
        _single(
            "p5.48xlarge (4 GPUs)",
            "none",
            _VLLM_064,
            "OK",
            "53",
            "--tensor-parallel-size 4",
        ),
        _single(
            "p5.48xlarge (2 GPUs)",
            "none",
            _VLLM_064,
            "OK",
            "33",
            "--tensor-parallel-size 2",
        ),
    ],
}