File size: 4,130 Bytes
ab5f5f1
 
 
 
 
a1135a9
 
ab5f5f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14d526b
 
 
 
 
 
 
 
 
 
ab5f5f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1135a9
08604d0
a1135a9
 
ab5f5f1
 
a1135a9
 
 
ab5f5f1
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os

import pandas as pd
from huggingface_hub import hf_hub_download

from .utils import process_quantization_scheme, process_arch

# Hugging Face dataset repo that holds both the Open LLM scores and the per-machine perf reports
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
# optional hub token read from the environment; None falls back to anonymous access
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# raw report column name -> display column name (with emoji) used by the UI
COLUMNS_MAPPING = {
    "Model": "Model πŸ€—",
    "Arch": "Arch πŸ›οΈ",
    "Size": "Params (B)",
    "Score": "Open LLM Score (%)",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "DType πŸ“₯",
    "optimization": "Optimization πŸ› οΈ",
    "quantization": "Quantization πŸ—œοΈ",
    # primary measurements
    "forward.latency(s)": "Prefill Latency (s)",
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # additional measurements
    "generate.latency(s)": "E2E Latency (s)",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}
# display-column names used for the default sort order of the merged dataframe
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
]
# one direction per entry of SORTING_COLUMNS: score desc, prefill latency asc, decode throughput desc
SORTING_ASCENDING = [False, True, False]


def get_llm_df():
    """Load the Open LLM leaderboard scores as a pandas DataFrame.

    The hub-download path is disabled while the scraping script is broken;
    instead the CSV is read straight from a pinned revision of the dataset repo.
    """
    # hf_hub_download(
    #     repo_id=LLM_PERF_DATASET_REPO,
    #     filename="open-llm.csv",
    #     local_dir="dataset",
    #     repo_type="dataset",
    #     token=HF_TOKEN,
    # )
    # llm_df = pd.read_csv("dataset/open-llm.csv")
    return pd.read_csv(
        "https://huggingface.co/datasets/optimum/llm-perf-dataset/raw/e8628583f0c31457cd5f8b81352735263117fbb4/open-llm.csv"
    )


def get_perf_df(machine: str = "hf-dgx-01"):
    """Download and parse the perf report for *machine*.

    Fetches ``{machine}/perf-report.csv`` from the llm-perf dataset repo
    into the local ``dataset/`` directory, then loads it with pandas.
    """
    report_path = f"{machine}/perf-report.csv"
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=report_path,
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return pd.read_csv(f"dataset/{report_path}")


def get_llm_perf_df(machine: str = "hf-dgx-01"):
    """Build the merged, display-ready leaderboard DataFrame for *machine*.

    Joins the Open LLM scores with the machine's perf report on the model
    name, derives the energy/optimization/quantization columns, then selects,
    renames and sorts the display columns.

    Raises:
        AssertionError: if the merged report mixes more than one benchmark
            configuration (batch size, sequence length, or new-token count).
    """
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")

    # sanity checks: every row must come from the same benchmark configuration
    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1

    # invert kWh/token into tokens/kWh; missing readings are filled with 1 so
    # the division yields exactly 1, which is mapped back to NA just below
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    llm_perf_df.loc[
        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
        "generate.energy_consumption(tokens/kWh)",
    ] = pd.NA

    # optimization label: BetterTransformer takes precedence when both flags are set
    llm_perf_df["optimization"] = llm_perf_df[
        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
    ].apply(
        lambda x: "BetterTransformer"
        if x["backend.to_bettertransformer"]
        else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
        axis=1,
    )
    # quantization label derived from the backend quantization config columns;
    # the function is passed directly — no lambda wrapper needed
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.bits",
            "backend.quantization_config.version",
            "backend.quantization_config.load_in_4bit",
            "backend.quantization_config.load_in_8bit",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(process_quantization_scheme, axis=1)
    # normalize architecture names for display
    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)

    # select, rename and sort the display columns in one non-inplace chain;
    # inplace ops on a column-subset selection can trigger SettingWithCopyWarning
    llm_perf_df = (
        llm_perf_df[list(COLUMNS_MAPPING.keys())]
        .rename(columns=COLUMNS_MAPPING)
        .sort_values(by=SORTING_COLUMNS, ascending=SORTING_ASCENDING)
    )

    return llm_perf_df