| import copy |
| import logging |
| import os |
|
|
| |
| |
| from datetime import datetime |
| from pathlib import Path |
|
|
| import pandas as pd |
| import torch |
| from auto_gptq import BaseQuantizeConfig as GPTQQuantConfig |
| from hqq.core.quantize import BaseQuantizeConfig as HQQQuantConfig |
| from transformers import BitsAndBytesConfig |
|
|
| from lm_quant_toolkit.adapter.autoawq import ( |
| create_autoawq_model, |
| quantize_autoawq_model, |
| ) |
| from lm_quant_toolkit.adapter.autogptq import ( |
| create_autogptq_model, |
| quantize_autogptq_model, |
| ) |
| from lm_quant_toolkit.adapter.bnb import create_bnb_model, quantize_bnb_model |
| from lm_quant_toolkit.adapter.fp16 import create_fp16_model |
| from lm_quant_toolkit.adapter.hqq import create_hqq_model, quantize_hqq_model |
| from lm_quant_toolkit.adapter.mxq import create_mxq_model, quantize_mxq_model |
| from lm_quant_toolkit.eval.common import ( |
| HQQ_CONFIGS, |
| _dump_cuda_mem_snapshot, |
| _reset_peak_memory_stats, |
| cleanup, |
| combine_metrics, |
| get_memory_metrics, |
| get_mxq_quant_meta_data_file, |
| persist_progress, |
| save_partial_metric, |
| ) |
| from lm_quant_toolkit.eval.leaderboard import eval_llm_leaderboard |
| from lm_quant_toolkit.eval.perplexity import eval_ppls |
|
|
# Hugging Face model ids that the experiment functions in this file select
# from; several of them index or slice this list, so the order matters.
ALL_MODELS = [
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-13b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.1-8B",
]
|
|
# One (name, config) pair per bit budget. The name is the budget rendered
# with two decimals and "." swapped for "_" (e.g. 4.25 -> "4_25").
MXQ_CONFIGS = [
    (
        format(budget, ".2f").replace(".", "_"),
        HQQQuantConfig(mixed=True, budget=budget, quant_scale=True),
    )
    for budget in (5.00, 4.75, 4.50, 4.25, 4.01, 3.76, 3.50, 3.00, 2.75, 2.48)
]
|
|
# (name, BitsAndBytesConfig) pairs: one 4-bit NF4 preset with double
# quantization and one 8-bit preset.
_BNB_4BIT = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)
_BNB_8BIT = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_compute_dtype="float16",
)
BNB_CONFIGS = [
    ("b4g64", _BNB_4BIT),
    ("b8g128", _BNB_8BIT),
]
|
|
|
|
# (name, quant_config) pairs for AutoAWQ: 4-bit weights, zero point enabled,
# GEMM kernels, one entry per group size.
AUTOAWQ_CONFIGS = [
    (
        f"b4g{group_size}",
        {
            "w_bit": 4,
            "q_group_size": group_size,
            "zero_point": True,
            "version": "GEMM",
        },
    )
    for group_size in (32, 64, 128)
]
|
|
# (name, GPTQQuantConfig) pairs named "b<bits>g<group_size>". Bit widths vary
# in the outer loop and group sizes in the inner loop, so the order is
# b8g32, b8g64, b8g128, b4g32, ..., b3g128.
GPTQ_CONFIGS = [
    (
        f"b{n_bits}g{group_size}",
        GPTQQuantConfig(
            bits=n_bits,
            group_size=group_size,
            damp_percent=0.01,
            desc_act=False,
        ),
    )
    for n_bits in (8, 4, 3)
    for group_size in (32, 64, 128)
]
|
|
|
|
def gen_experiment_items(models, tasks):
    """Expand ``tasks`` x ``models`` into one DataFrame row per sub-task.

    Args:
        models: Iterable of model ids.
        tasks: Mapping of algorithm name to a spec dict holding ``"type"``
            (the task type) and ``"configs"`` (a sequence whose items carry
            the config name at index 0).

    Returns:
        A ``pandas.DataFrame`` with the columns ``model``, ``cfg``,
        ``task_type`` and ``algo``. Rows are ordered by algorithm, then
        config, then model. The columns are present even when there are no
        rows, so callers can merge or filter on them safely.
    """
    columns = ["model", "cfg", "task_type", "algo"]
    rows = [
        {
            "model": model_id,
            "cfg": config[0],
            "task_type": spec["type"],
            "algo": algo,
        }
        for algo, spec in tasks.items()
        for config in spec["configs"]
        for model_id in models
    ]
    # Pinning the columns keeps the frame well-formed for empty input.
    return pd.DataFrame(rows, columns=columns)
|
|
|
|
def _setup_fn(algo, spec):
    """Store the model factory and quantizer for ``algo`` into ``spec``.

    Sets ``spec["create_fn"]`` and ``spec["quantize_fn"]`` in place; for
    ``"fp16"`` the quantizer is ``None``.

    Raises:
        ValueError: If ``algo`` is not one of the supported algorithm names.
    """
    if algo not in ("fp16", "awq", "gptq", "hqq", "bnb", "mxq"):
        raise ValueError(f"Invalid algo: {algo}")

    # algo -> (create_fn, quantize_fn)
    dispatch = {
        "fp16": (create_fp16_model, None),
        "awq": (create_autoawq_model, quantize_autoawq_model),
        "gptq": (create_autogptq_model, quantize_autogptq_model),
        "hqq": (create_hqq_model, quantize_hqq_model),
        "bnb": (create_bnb_model, quantize_bnb_model),
        "mxq": (create_mxq_model, quantize_mxq_model),
    }
    spec["create_fn"], spec["quantize_fn"] = dispatch[algo]
|
|
|
|
def do_expermient(
    experiment_name,
    models,
    tasks,
    quant_dir="snapshots",
    result_dir="results",
    log_dir="logs",
    track_cuda_memory=False,
    **kwargs,
):
    """Run every pending (model, algo, config) sub-task of an experiment.

    The sub-task list comes from ``gen_experiment_items``. If
    ``<result_dir>/<experiment_name>/progress.csv`` exists, it is merged in
    and rows whose ``status`` is already 1 are skipped. After each sub-task
    its metrics are saved and the progress table is persisted. Once all
    pending sub-tasks have run, the metrics are combined.

    Args:
        experiment_name: Name used for the progress file path and passed to
            the metric/persistence helpers.
        models: Model ids to run.
        tasks: Mapping of algorithm name to a spec dict with ``"type"`` and
            ``"configs"``. Each spec is mutated: ``_setup_fn`` adds
            ``"create_fn"`` and ``"quantize_fn"`` to it.
        quant_dir: Directory handed to the create/quantize/eval helpers.
        result_dir: Directory for progress and metrics.
        log_dir: Accepted for interface compatibility; not used here.
        track_cuda_memory: When true, record CUDA memory history and dump a
            snapshot after evaluation.
        **kwargs: Optional MXQ settings (``weight_algo``, ``boost_layers``,
            ``decline_layers``, ``boost_stop``, ``decline_stop``,
            ``ablation``, ``top_m_layer``, ``factor``) copied into the
            quantization config when ``algo == "mxq"``.

    Returns:
        None. Returns early, without combining metrics, when nothing is
        pending or when the MXQ quantization metadata file is missing.
    """
    # NOTE(review): the nesting in this function was reconstructed from a
    # copy of the source that had lost its indentation. Confirm the placement
    # of `_reset_peak_memory_stats()` and `cleanup(model)` against the
    # canonical file.
    mxq_options = (
        "weight_algo",
        "boost_layers",
        "decline_layers",
        "boost_stop",
        "decline_stop",
        "ablation",
        "top_m_layer",
        "factor",
    )

    df_all = gen_experiment_items(models, tasks)
    progress_path = os.path.join(result_dir, experiment_name, "progress.csv")
    if Path(progress_path).exists():
        df_saved = pd.read_csv(progress_path)
        df_all = df_all.merge(
            df_saved, how="left", on=["model", "cfg", "task_type", "algo"]
        )
        # `status != status` holds only for NaN, i.e. rows with no saved
        # progress after the left merge.
        df_todo = df_all.query("status != status or status != 1")
    else:
        df_all["status"] = 0
        df_all["completion_time"] = ""
        df_todo = df_all

    print("*" * 72)
    print("Sub-task list:")
    print(df_all)
    cnt_todo, cnt_tot = len(df_todo), len(df_all)
    print(f"Todo:{cnt_todo}, Done: {cnt_tot - cnt_todo}, Total: {cnt_tot}")
    if cnt_todo == 0:
        print("Tasks completed!")
        print("*" * 72)
        return
    print("*" * 72)

    df_todo = df_todo.sort_values(by=["model", "cfg"], ascending=False)
    for _, row in df_todo.iterrows():
        model_id = row["model"]
        algo = row["algo"]
        task_type = row["task_type"]
        cfg = row["cfg"]
        spec = tasks[algo]
        _setup_fn(algo, spec)
        # First (name, detail) pair whose name matches this row.
        config = next(c for c in spec["configs"] if c[0] == cfg)
        quant_fn = spec["quantize_fn"]
        metric = _init_metrics(model_id, algo, config)

        if task_type == "quant":
            banner = f"Quantizing {algo} on {model_id} w/ config: {cfg}..."
        elif task_type == "eval_ppl":
            banner = f"Evaluating {algo} PPL on {model_id} w/ config: {cfg}..."
        elif task_type == "eval_leaderboard":
            banner = f"Evaluating {algo} LLM Leaderboard benchmarks on {model_id} w/ config: {cfg}..."
        else:
            banner = f"Evaluating {algo} model storage metrics on {model_id} w/ config: {cfg}..."
        print("*" * 72)
        print(banner)
        print("*" * 72)

        if track_cuda_memory:
            torch.cuda.memory._record_memory_history()
        _reset_peak_memory_stats()

        if task_type != "eval_leaderboard":
            create_fn = spec["create_fn"]
            model, tokenizer, quantized, model_file_size = create_fn(
                model_id, config[1], cfg, quant_fn is not None, quant_dir
            )

            if not quantized and quant_fn:
                # Work on a copy so per-model tweaks never leak into the
                # shared config object.
                quant_config = copy.deepcopy(config[1])
                if algo == "mxq":
                    ok, metric_fp = get_mxq_quant_meta_data_file(model_id)
                    if not ok:
                        print(
                            f"Quantization meta data file: {metric_fp} doesn't exists!"
                        )
                        return
                    quant_config["quant_metrics_file"] = metric_fp
                    for option in mxq_options:
                        quant_config[option] = kwargs.get(option, None)
                model, duration, model_file_size = quant_fn(
                    model,
                    tokenizer,
                    quant_config,
                    model_id,
                    cfg,
                    quant_dir,
                )
                metric["quant_duration"] = duration

            if task_type == "eval_model_storage":
                allot, reserved = get_memory_metrics()
                metric["load_mem_allot"] = allot
                metric["load_mem_reserved"] = reserved
                metric["model_storage_size"] = model_file_size
            elif task_type == "eval_ppl":
                metric = eval_ppls(model, tokenizer, metric)
                metric["ppl_mem_allot"], metric["ppl_mem_reserved"] = (
                    get_memory_metrics()
                )
                if track_cuda_memory:
                    _dump_cuda_mem_snapshot(
                        experiment_name, model_id, algo, result_dir
                    )
            cleanup(model)
        else:
            metric = eval_llm_leaderboard(
                experiment_name,
                model_id,
                algo,
                cfg,
                quant_fn is not None,
                metric,
                quant_dir,
                result_dir,
            )
            metric["leaderboard_mem_allot"], metric["leaderboard_mem_reserved"] = (
                get_memory_metrics()
            )
            if track_cuda_memory:
                _dump_cuda_mem_snapshot(experiment_name, model_id, algo, result_dir)

        save_partial_metric(experiment_name, algo, model_id, cfg, metric, result_dir)
        # Mark this sub-task done and persist at once, so an interrupted run
        # can resume from the progress file.
        done_mask = (
            (df_all["model"] == model_id)
            & (df_all["cfg"] == cfg)
            & (df_all["algo"] == algo)
            & (df_all["task_type"] == task_type)
        )
        df_all.loc[done_mask, ["status", "completion_time"]] = (
            1,
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        )
        persist_progress(df_all, progress_path)

    combine_metrics(experiment_name, result_dir)
|
|
|
|
def _init_metrics(model_id, algo, config):
    """Build the zero-initialised metric record for one sub-task.

    Args:
        model_id: Model id such as ``"org/name"``. The part after the last
            ``"/"`` is stored as ``"model"``; an id without ``"/"`` is
            stored unchanged.
        algo: Algorithm name.
        config: ``(name, detail)`` pair. ``name`` is stored as ``"config"``;
            ``str(detail)`` with newlines removed is stored as
            ``"config_detail"``.

    Returns:
        A dict holding the identifying fields plus every metric key set to 0.
    """
    return {
        # rsplit keeps ids without an org prefix from raising IndexError.
        "model": model_id.rsplit("/", 1)[-1],
        "algo": algo,
        "config": config[0],
        "config_detail": str(config[1]).replace("\n", ""),
        "quant_duration": 0,
        "model_storage_size": 0,
        "load_mem_allot": 0,
        "load_mem_reserved": 0,
        "ppl_mem_allot": 0,
        "ppl_mem_reserved": 0,
        "leaderboard_mem_allot": 0,
        "leaderboard_mem_reserved": 0,
        "ppl_wikitext": 0,
        "ppl_c4": 0,
        "duration_wikitext": 0,
        "duration_c4": 0,
        "duration_leaderboard": 0,
        "ifeval": 0,
        "bbh": 0,
        "mathlevel5": 0,
        "gpqa": 0,
        "musr": 0,
        "mmlupro": 0,
    }
|
|
|
|
def do_expermient_fdata(
    experiment_name,
    models,
    tasks,
    track_cuda_memory=False,
):
    """Call ``do_expermient`` with all directories under ``/fdata/llm/mxq``.

    ``experiment_name``, ``models``, ``tasks`` and ``track_cuda_memory`` are
    forwarded unchanged.
    """
    fdata_dirs = {
        "quant_dir": "/fdata/llm/mxq/snapshots",
        "result_dir": "/fdata/llm/mxq/results",
        "log_dir": "/fdata/llm/mxq/logs",
    }
    do_expermient(
        experiment_name,
        models,
        tasks,
        track_cuda_memory=track_cuda_memory,
        **fdata_dirs,
    )
|
|
|
|
| |
| |
| |
|
|
|
|
def experiment_quant_hqq():
    """Run the ``quant`` task with ``hqq`` over ``HQQ_CONFIGS`` for all models."""
    hqq_spec = {"type": "quant", "configs": HQQ_CONFIGS}
    do_expermient_fdata("quant_hqq", ALL_MODELS, {"hqq": hqq_spec})
|
|
|
|
def experiment_quant_mxq():
    """Run the ``quant`` task with ``mxq`` over ``MXQ_CONFIGS`` for all models."""
    task_type, algo = "quant", "mxq"
    task_spec = {"type": task_type, "configs": MXQ_CONFIGS}
    do_expermient_fdata(f"{task_type}_{algo}_mxq", ALL_MODELS, {algo: task_spec})
|
|
|
|
def experiment_quant_awq():
    """Run the ``quant`` task with ``awq`` over ``AUTOAWQ_CONFIGS`` for ``ALL_MODELS[1]`` only."""
    task_type, algo = "quant", "awq"
    selected_models = [ALL_MODELS[1]]
    task_spec = {"type": task_type, "configs": AUTOAWQ_CONFIGS}
    do_expermient_fdata(f"{task_type}_{algo}", selected_models, {algo: task_spec})
|
|
|
|
def experiment_quant_gptq():
    """Run the ``quant`` task with ``gptq`` over ``GPTQ_CONFIGS`` for all models."""
    task_type, algo = "quant", "gptq"
    task_spec = {"type": task_type, "configs": GPTQ_CONFIGS}
    do_expermient_fdata(f"{task_type}_{algo}", ALL_MODELS, {algo: task_spec})
|
|
|
|
def experiment_quantize_405B():
    """Run the ``quant`` task with ``hqq`` on Llama 3.1 405B Instruct.

    Only the second entry of ``HQQ_CONFIGS`` is used. Snapshots go to
    ``/data/gqq-eval/snapshots/``; the other directories keep the defaults
    of ``do_expermient``.
    """
    hqq_spec = {"type": "quant", "configs": HQQ_CONFIGS[1:2]}
    do_expermient(
        "quant_hqq_405B",
        ["meta-llama/Meta-Llama-3.1-405B-Instruct"],
        {"hqq": hqq_spec},
        quant_dir="/data/gqq-eval/snapshots/",
    )
|
|
|
|
| |
| |
| |
|
|
|
|
def experiment_ppl_eval_fp16():
    """Run the ``eval_ppl`` task with ``fp16`` (single ``base`` config) for all models."""
    task_type, algo = "eval_ppl", "fp16"
    task_spec = {"type": task_type, "configs": [("base", {})]}
    do_expermient_fdata(f"{task_type}_{algo}", ALL_MODELS, {algo: task_spec})
|
|
|
|
def experiment_ppl_eval_awq():
    """Run the ``eval_ppl`` task with ``awq`` on ``ALL_MODELS[0]`` and ``ALL_MODELS[2]``.

    Only the second entry of ``AUTOAWQ_CONFIGS`` is used.
    """
    task_type, algo = "eval_ppl", "awq"
    selected_models = [ALL_MODELS[0], ALL_MODELS[2]]
    task_spec = {"type": task_type, "configs": AUTOAWQ_CONFIGS[1:2]}
    do_expermient_fdata(f"{task_type}_{algo}", selected_models, {algo: task_spec})
|
|
|
|
def experiment_ppl_eval_gptq():
    """Run the ``eval_ppl`` task with ``gptq`` over ``GPTQ_CONFIGS`` for all models."""
    task_type, algo = "eval_ppl", "gptq"
    task_spec = {"type": task_type, "configs": GPTQ_CONFIGS}
    do_expermient_fdata(f"{task_type}_{algo}", ALL_MODELS, {algo: task_spec})
|
|
|
|
def experiment_ppl_eval_hqq():
    """Run the ``eval_ppl`` task with ``hqq`` over ``HQQ_CONFIGS`` for all models."""
    task_type, algo = "eval_ppl", "hqq"
    task_spec = {"type": task_type, "configs": HQQ_CONFIGS}
    do_expermient_fdata(f"{task_type}_{algo}", ALL_MODELS, {algo: task_spec})
|
|
|
|
| |
| |
| |
|
|
|
|
def experiment_llm_leaderboard_fp16():
    """Run the ``eval_leaderboard`` task with ``fp16`` on ``ALL_MODELS[0]`` and ``ALL_MODELS[2]``."""
    task_type, algo = "eval_leaderboard", "fp16"
    selected_models = [ALL_MODELS[0], ALL_MODELS[2]]
    task_spec = {"type": task_type, "configs": [("base", {})]}
    do_expermient_fdata(f"{task_type}_{algo}", selected_models, {algo: task_spec})
|
|
|
|
def experiment_llm_leaderboard_autogptq():
    """Run the ``eval_leaderboard`` task with ``gptq`` over ``GPTQ_CONFIGS`` for all models."""
    task_type, algo = "eval_leaderboard", "gptq"
    task_spec = {"type": task_type, "configs": GPTQ_CONFIGS}
    do_expermient_fdata(f"{task_type}_{algo}", ALL_MODELS, {algo: task_spec})
|
|
|
|
def experiment_llm_leaderboard_hqq():
    """Run the ``eval_leaderboard`` task with ``hqq`` over ``HQQ_CONFIGS`` for all models."""
    task_type, algo = "eval_leaderboard", "hqq"
    task_spec = {"type": task_type, "configs": HQQ_CONFIGS}
    do_expermient_fdata(f"{task_type}_{algo}", ALL_MODELS, {algo: task_spec})
|
|
|
|
def experiment_llm_leaderboard_mxq():
    """Run the ``eval_leaderboard`` task with ``mxq`` over ``MXQ_CONFIGS`` for all models."""
    task_type, algo = "eval_leaderboard", "mxq"
    task_spec = {"type": task_type, "configs": MXQ_CONFIGS}
    do_expermient_fdata(f"{task_type}_{algo}", ALL_MODELS, {algo: task_spec})
|
|
|
|
def experiment_llm_leaderboard_autoawq():
    """Run the ``eval_leaderboard`` task with ``awq`` on the first model only.

    Uses the first two entries of ``AUTOAWQ_CONFIGS``.
    """
    task_type, algo = "eval_leaderboard", "awq"
    selected_models = ALL_MODELS[0:1]
    task_spec = {"type": task_type, "configs": AUTOAWQ_CONFIGS[0:2]}
    do_expermient_fdata(f"{task_type}_{algo}", selected_models, {algo: task_spec})
|
|
|
|
| |
| |
| |
|
|
|
|
def experiment_quant_ppl_eval_mxq_comprise():
    """Run ``quant`` and then ``eval_ppl`` for a set of mixed-budget configs.

    One config is built per bit budget and named ``mxq-<budget>`` with "."
    replaced by "_". Both runs register the configs under the ``hqq``
    algorithm key and cover all models.
    """
    budgets = (4.06, 4.10, 4.15, 4.19, 4.24, 4.28, 4.33)
    equiv_mxq_configs = [
        (
            f"mxq-{str(budget).replace('.', '_')}",
            HQQQuantConfig(mixed=True, budget=budget, quant_scale=True),
        )
        for budget in budgets
    ]
    # Two separate spec dicts: each run attaches its own helper entries.
    quant_tasks = {"hqq": {"type": "quant", "configs": equiv_mxq_configs}}
    ppl_tasks = {"hqq": {"type": "eval_ppl", "configs": equiv_mxq_configs}}
    do_expermient_fdata("quant_mxq_compromise", ALL_MODELS, quant_tasks)
    do_expermient_fdata("eval_mxq_compromise", ALL_MODELS, ppl_tasks)
|
|
|
|
| |
| |
| |
|
|
|
|
def experiment_fp16_llama3_8B_OOM():
    """Run the ``eval_ppl`` task with ``fp16`` on the last model, with CUDA memory tracking on."""
    task_type, algo = "eval_ppl", "fp16"
    last_model_only = ALL_MODELS[-1:]
    task_spec = {"type": task_type, "configs": [("base", {})]}
    do_expermient_fdata(
        f"{task_type}_llama3_8B_OOM_{algo}",
        last_model_only,
        {algo: task_spec},
        track_cuda_memory=True,
    )
|
|
|
|
def experiment_fp16_vs_hqq_eval_gpu_mem():
    """Run ``eval_ppl`` on the last model with ``fp16``, then with ``hqq``.

    Both runs use the same experiment name. The ``hqq`` run uses only the
    second entry of ``HQQ_CONFIGS``.
    """
    task_type = "eval_ppl"
    experiment_name = "experiment_fp16_vs_hqq_eval_gpu_mem"
    last_model_only = ALL_MODELS[-1:]

    fp16_tasks = {"fp16": {"type": task_type, "configs": [("base", {})]}}
    do_expermient_fdata(experiment_name, last_model_only, fp16_tasks)

    hqq_tasks = {"hqq": {"type": task_type, "configs": HQQ_CONFIGS[1:2]}}
    do_expermient_fdata(experiment_name, last_model_only, hqq_tasks)
|
|
|
|
def experiment_eval_model_storage():
    """Run the ``eval_model_storage`` task five times for all models.

    Covers ``fp16``, ``mxq``, ``hqq``, ``awq`` and ``gptq``. Each repetition
    gets its own experiment name, suffixed with the run index 0-4.
    """
    task_type = "eval_model_storage"
    configs_by_algo = (
        ("fp16", [("base", {})]),
        ("mxq", MXQ_CONFIGS),
        ("hqq", HQQ_CONFIGS),
        ("awq", AUTOAWQ_CONFIGS),
        ("gptq", GPTQ_CONFIGS),
    )
    # Built once and shared by every repetition.
    tasks = {
        algo: {"type": task_type, "configs": configs}
        for algo, configs in configs_by_algo
    }
    for run_idx in range(5):
        do_expermient_fdata(f"eval_model_storge_{run_idx}", ALL_MODELS, tasks)
|
|
|
|
def experiment_eval_ppl_all():
    """Run the ``eval_ppl`` task for all models across ``fp16``, ``mxq``, ``hqq``, ``awq`` and ``gptq``."""
    task_type = "eval_ppl"
    configs_by_algo = (
        ("fp16", [("base", {})]),
        ("mxq", MXQ_CONFIGS),
        ("hqq", HQQ_CONFIGS),
        ("awq", AUTOAWQ_CONFIGS),
        ("gptq", GPTQ_CONFIGS),
    )
    tasks = {
        algo: {"type": task_type, "configs": configs}
        for algo, configs in configs_by_algo
    }
    do_expermient_fdata("experiment_eval_ppl_all", ALL_MODELS, tasks)
|
|
|
|
def experiment_debug_quant_hqq():
    """Run the ``eval_model_storage`` task with ``hqq`` on ``ALL_MODELS[1]``.

    Uses only the second entry of ``HQQ_CONFIGS``. Snapshots go to a
    separate ``snapshots-debug`` directory; results go to the regular
    ``/fdata/llm/mxq/results`` directory.
    """
    task_type, algo = "eval_model_storage", "hqq"
    task_spec = {"type": task_type, "configs": HQQ_CONFIGS[1:2]}
    do_expermient(
        f"debug_{task_type}_{algo}",
        [ALL_MODELS[1]],
        {algo: task_spec},
        quant_dir="/fdata/llm/mxq/snapshots-debug",
        result_dir="/fdata/llm/mxq/results",
    )
|
|
|
|
def main():
    """Configure INFO-level logging, then run the currently selected experiment."""
    log_format = "%(asctime)s %(levelname)s [%(name)s] %(message)s"
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        datefmt=timestamp_format,
    )

    # To run a different experiment, replace this call.
    experiment_debug_quant_hqq()
|
|
|
|
if __name__ == "__main__":
    # Cap the numeric libraries' thread pools at 8 threads. os.cpu_count()
    # may return None when the count cannot be determined, so fall back to 1
    # instead of letting min() raise TypeError.
    max_threads = str(min(8, os.cpu_count() or 1))
    for thread_var in (
        "OMP_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "MKL_NUM_THREADS",
        "VECLIB_MAXIMUM_THREADS",
        "NUMEXPR_NUM_THREADS",
        "NUMEXPR_MAX_THREADS",
    ):
        os.environ[thread_var] = max_threads
    # NOTE(review): these variables are set after torch/transformers were
    # imported at the top of the file. Libraries that read them at import
    # time may not pick them up; confirm whether they belong before the
    # imports.
    os.environ["HF_HOME"] = "/data/hugginface/"

    main()
|
|