File size: 5,070 Bytes
ab5f5f1 a1135a9 ab5f5f1 a8a6326 ab5f5f1 0232cf1 ab5f5f1 a1f6c2e ab5f5f1 a8a6326 0232cf1 a1f6c2e 0232cf1 a8a6326 ab5f5f1 0232cf1 ab5f5f1 14d526b eeaa368 a8a6326 eeaa368 ab5f5f1 a8a6326 ab5f5f1 a1135a9 08604d0 a1135a9 ab5f5f1 a1135a9 a8a6326 0232cf1 774f9d7 a1135a9 ab5f5f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import os
import pandas as pd
from huggingface_hub import hf_hub_download
from .utils import process_quantization_scheme, process_arch
# Hugging Face dataset repo that hosts the scraped Open LLM scores and the
# per-machine performance reports downloaded below.
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
# Optional auth token (e.g. for gated/private access); None means anonymous.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# Maps raw report column names to the human-facing labels used in the UI.
# NOTE: key *order* matters — get_llm_perf_df() selects columns with
# list(COLUMNS_MAPPING.keys()), so this order is the display order.
COLUMNS_MAPPING = {
    "Model": "Model π€",
    "experiment_name": "Experiment π§ͺ",
    # primary measurements
    "forward.latency(s)": "Prefill (s)",
    "decode.throughput(tokens/s)": "Decode (tokens/s)",
    "generate.max_memory_allocated(MB)": "Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # deployment settings
    "backend.name": "Backend π",
    "backend.torch_dtype": "DType π₯",
    "optimization": "Optimization π οΈ",
    "quantization": "Quantization ποΈ",
    # additional measurements
    "Size": "Params (B)",
    "Arch": "Architecture ποΈ",
    "Score": "Open LLM Score (%)",
    "generate.latency(s)": "End-to-End (s)",
    "generate.throughput(tokens/s)": "End-to-End (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}
# Columns used to rank the final dataframe, in priority order.
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Decode (tokens/s)",
    "Prefill (s)",
]
# Sort direction for each entry of SORTING_COLUMNS, position by position:
# score and decode throughput are "higher is better" (descending → False),
# prefill latency is "lower is better" (ascending → True).
# Fix: the previous [False, True, False] ranked slowest decode and slowest
# prefill first — the flags were not updated when the column order changed.
SORTING_ASCENDING = [False, False, True]
def get_llm_df():
    """Download the Open LLM scores CSV from the dataset repo and load it.

    Returns:
        pd.DataFrame: the contents of ``open-llm.csv``.
    """
    # hf_hub_download returns the local path of the fetched file, so read
    # from its return value instead of re-deriving "dataset/open-llm.csv"
    # by hand — keeps the read location correct even if the hub's local
    # layout changes.
    file_path = hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename="open-llm.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return pd.read_csv(file_path)
def get_perf_df(machine: str = "hf-dgx-01"):
    """Download the performance report CSV for *machine* and load it.

    Args:
        machine: subdirectory of the dataset repo holding the report
            (defaults to "hf-dgx-01").

    Returns:
        pd.DataFrame: the contents of ``{machine}/perf-report.csv``.
    """
    # Read from hf_hub_download's returned local path rather than rebuilding
    # the path by hand (same rationale as get_llm_df).
    file_path = hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=f"{machine}/perf-report.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return pd.read_csv(file_path)
def get_llm_perf_df(machine: str = "hf-dgx-01") -> pd.DataFrame:
    """Build the merged, cleaned, and sorted leaderboard dataframe for *machine*.

    Inner-joins the Open LLM scores with the machine's performance report,
    derives display columns (energy, optimization, quantization, experiment
    name), then filters, renames, and sorts according to the module constants.
    """
    # fetch both source dataframes and inner-join on the model identifier
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
    # sanity checks: all rows must come from one benchmark configuration,
    # otherwise the measurements are not comparable.
    # NOTE(review): asserts are stripped under `python -O` — confirm acceptable.
    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1
    # invert kWh/token into tokens/kWh; missing values are filled with 1 so
    # the inversion yields exactly 1, which is turned back into NA just below
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    # restore the placeholder 1s produced above to NA
    # NOTE(review): a genuine measurement of exactly 1 token/kWh would also be
    # erased here — presumably impossible in practice, but worth confirming.
    llm_perf_df.loc[
        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
        "generate.energy_consumption(tokens/kWh)",
    ] = pd.NA
    # derive a single "optimization" label from the two backend flags
    # NOTE(review): NaN is truthy in Python, so a missing flag would be
    # labeled as enabled — assumes both columns are real booleans; confirm.
    llm_perf_df["optimization"] = llm_perf_df[["backend.to_bettertransformer", "backend.use_flash_attention_2"]].apply(
        lambda x: (
            "BetterTransformer"
            if x["backend.to_bettertransformer"]
            else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None")
        ),
        axis=1,
    )
    # collapse the per-backend quantization settings into one scheme label
    # (the interpretation of these columns lives in process_quantization_scheme)
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.bits",
            "backend.quantization_config.version",
            "backend.quantization_config.load_in_4bit",
            "backend.quantization_config.load_in_8bit",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(lambda x: process_quantization_scheme(x), axis=1)
    # normalize experiment names for display: drop the backend prefix ...
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
    # ... drop the dtype prefix on quantized runs only (the "bit" marker
    # already carries the precision there) ...
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
        lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
    )
    # ... and map internal tokens to their public spellings
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bnb-4bit", "BnB-4bit"))
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bnb-8bit", "BnB-8bit"))
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("awq-4bit", "AWQ-4bit"))
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("gptq-4bit", "GPTQ-4bit"))
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bettertransformer", "SDPA"))
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("flash-attention-v2", "FA-v2"))
    # normalize the architecture labels via the shared helper
    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
    # keep only the display columns; COLUMNS_MAPPING key order is display order
    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
    # rename columns to their human-facing labels
    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
    # rank rows by the configured metrics and directions
    llm_perf_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
        inplace=True,
    )
    return llm_perf_df
|