"""Experiment drivers for quantizing language models and evaluating the results.

The module declares the quantization configurations for each supported
algorithm, a resumable experiment runner (``do_expermient``) and a collection
of ``experiment_*`` functions that each launch one concrete experiment.
"""

import copy
import logging
import os

# from adapter.awq import create_awq_model
# from adapter.awq import quantize_awq_model
from datetime import datetime
from pathlib import Path

import pandas as pd
import torch
from auto_gptq import BaseQuantizeConfig as GPTQQuantConfig
from hqq.core.quantize import BaseQuantizeConfig as HQQQuantConfig
from transformers import BitsAndBytesConfig

from lm_quant_toolkit.adapter.autoawq import (
    create_autoawq_model,
    quantize_autoawq_model,
)
from lm_quant_toolkit.adapter.autogptq import (
    create_autogptq_model,
    quantize_autogptq_model,
)
from lm_quant_toolkit.adapter.bnb import create_bnb_model, quantize_bnb_model
from lm_quant_toolkit.adapter.fp16 import create_fp16_model
from lm_quant_toolkit.adapter.hqq import create_hqq_model, quantize_hqq_model
from lm_quant_toolkit.adapter.mxq import create_mxq_model, quantize_mxq_model
from lm_quant_toolkit.eval.common import (
    HQQ_CONFIGS,
    _dump_cuda_mem_snapshot,
    _reset_peak_memory_stats,
    cleanup,
    combine_metrics,
    get_memory_metrics,
    get_mxq_quant_meta_data_file,
    persist_progress,
    save_partial_metric,
)
from lm_quant_toolkit.eval.leaderboard import eval_llm_leaderboard
from lm_quant_toolkit.eval.perplexity import eval_ppls

ALL_MODELS = [
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-13b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.1-8B",
]

# Each *_CONFIGS entry is a ``(config_name, config_object)`` pair.
MXQ_CONFIGS = [
    (
        f"{bits:.2f}".replace(".", "_"),
        HQQQuantConfig(mixed=True, budget=bits, quant_scale=True),
    )
    for bits in [5.00, 4.75, 4.50, 4.25, 4.01, 3.76, 3.50, 3.00, 2.75, 2.48]
]

BNB_CONFIGS = [
    (
        "b4g64",
        BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=True,
        ),
    ),
    (
        "b8g128",
        BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_4bit_compute_dtype="float16",
        ),
    ),
]

AUTOAWQ_CONFIGS = [
    (
        f"b4g{group_size}",
        {
            "w_bit": 4,
            "q_group_size": group_size,
            "zero_point": True,
            "version": "GEMM",
        },
    )
    for group_size in (32, 64, 128)
    # 3-bit not supported by AutoAWQ right now
    # ("b3g64", {"w_bit": 3, "q_group_size": 64, "zero_point": True, 'version':'gemv_fast'}),
    # ("b3g128", {"w_bit": 3, "q_group_size": 128, "zero_point": True, 'version':'gemv_fast'}),
]

# Ordered b8g32, b8g64, b8g128, b4g32, ..., b3g128.
GPTQ_CONFIGS = [
    (
        f"b{bits}g{group_size}",
        GPTQQuantConfig(
            bits=bits, group_size=group_size, damp_percent=0.01, desc_act=False
        ),
    )
    for bits in (8, 4, 3)
    for group_size in (32, 64, 128)
]

# algo name -> (create_fn, quantize_fn); fp16 has no quantization step.
_ALGO_FUNCTIONS = {
    "fp16": (create_fp16_model, None),
    "awq": (create_autoawq_model, quantize_autoawq_model),
    "gptq": (create_autogptq_model, quantize_autogptq_model),
    "hqq": (create_hqq_model, quantize_hqq_model),
    "bnb": (create_bnb_model, quantize_bnb_model),
    "mxq": (create_mxq_model, quantize_mxq_model),
}

# Keyword arguments of ``do_expermient`` copied into the MXQ quant config.
_MXQ_OPTION_KEYS = (
    "weight_algo",
    "boost_layers",
    "decline_layers",
    "boost_stop",
    "decline_stop",
    "ablation",
    "top_m_layer",
    "factor",
)

_PROGRESS_KEYS = ["model", "cfg", "task_type", "algo"]


def gen_experiment_items(models, tasks):
    """Build the table of sub-tasks, one row per (algo, config, model).

    Args:
        models: iterable of model ids.
        tasks: mapping of algo name to a spec dict holding ``"type"`` and
            ``"configs"`` (a list of ``(name, config)`` pairs).

    Returns:
        A ``pandas.DataFrame`` with columns ``model``, ``cfg``, ``task_type``
        and ``algo``.
    """
    items = [
        {
            "model": model_id,
            "cfg": config[0],
            "task_type": spec["type"],
            "algo": algo,
        }
        for algo, spec in tasks.items()
        for config in spec["configs"]
        for model_id in models
    ]
    return pd.DataFrame(items)


def _setup_fn(algo, spec):
    """Store the create/quantize callables for ``algo`` into ``spec`` in place.

    Raises:
        ValueError: if ``algo`` is not one of the supported algorithms.
    """
    try:
        create_fn, quantize_fn = _ALGO_FUNCTIONS[algo]
    except KeyError:
        raise ValueError(f"Invalid algo: {algo}") from None
    spec["create_fn"] = create_fn
    spec["quantize_fn"] = quantize_fn


def do_expermient(
    experiment_name,
    models,
    tasks,
    quant_dir="snapshots",
    result_dir="results",
    log_dir="logs",
    track_cuda_memory=False,
    **kwargs,
):
    """Run every pending sub-task of an experiment and persist its progress.

    Progress is tracked in ``<result_dir>/<experiment_name>/progress.csv``;
    sub-tasks already recorded there with ``status == 1`` are skipped, so an
    interrupted experiment can be resumed by calling this function again.

    Args:
        experiment_name: name used for the progress file and result output.
        models: iterable of model ids to process.
        tasks: mapping of algo name to a spec dict with ``"type"`` (one of
            ``"quant"``, ``"eval_ppl"``, ``"eval_leaderboard"``; any other
            value is treated as a model-storage evaluation banner-wise) and
            ``"configs"``. The spec dicts are mutated: ``"create_fn"`` and
            ``"quantize_fn"`` entries are added.
        quant_dir: directory passed to the create/quantize functions.
        result_dir: directory for progress and metric output.
        log_dir: currently unused by this function.
        track_cuda_memory: when true, CUDA memory history is recorded and a
            snapshot is dumped for each sub-task.
        **kwargs: optional MXQ settings (``weight_algo``, ``boost_layers``,
            ``decline_layers``, ``boost_stop``, ``decline_stop``,
            ``ablation``, ``top_m_layer``, ``factor``) copied into the
            quantization config when the algo is ``"mxq"``.

    Returns:
        None. Returns early (without combining metrics) when there is nothing
        to do or when the MXQ quantization meta data file is missing.
    """
    df_all = gen_experiment_items(models, tasks)
    progress_path = os.path.join(result_dir, experiment_name, "progress.csv")
    if Path(progress_path).exists():
        df_saved = pd.read_csv(progress_path)
        df_all = df_all.merge(df_saved, how="left", on=_PROGRESS_KEYS)
        # Keep rows never recorded (status is NaN after the left merge) as
        # well as rows recorded but not completed; NaN != 1 evaluates to True.
        df_todo = df_all[df_all["status"] != 1]
    else:
        df_all["status"] = 0
        df_all["completion_time"] = ""
        df_todo = df_all

    print("*" * 72)
    print("Sub-task list:")
    print(df_all)
    cnt_todo, cnt_tot = len(df_todo), len(df_all)
    print(f"Todo:{cnt_todo}, Done: {cnt_tot - cnt_todo}, Total: {cnt_tot}")
    if cnt_todo == 0:
        print("Tasks completed!")
        print("*" * 72)
        return
    print("*" * 72)

    df_todo = df_todo.sort_values(by=["model", "cfg"], ascending=False)
    for _, row in df_todo.iterrows():
        model_id = row["model"]
        algo = row["algo"]
        task_type = row["task_type"]
        cfg = row["cfg"]
        spec = tasks[algo]
        _setup_fn(algo, spec)
        config = [c for c in spec["configs"] if c[0] == cfg][0]
        quant_fn = spec["quantize_fn"]
        metric = _init_metrics(model_id, algo, config)

        print("*" * 72)
        if task_type == "quant":
            print(f"Quantizing {algo} on {model_id} w/ config: {cfg}...")
        elif task_type == "eval_ppl":
            print(f"Evaluating {algo} PPL on {model_id} w/ config: {cfg}...")
        elif task_type == "eval_leaderboard":
            print(
                f"Evaluating {algo} LLM Leaderboard benchmarks on {model_id} w/ config: {cfg}..."
            )
        else:
            print(
                f"Evaluating {algo} model storage metrics on {model_id} w/ config: {cfg}..."
            )
        print("*" * 72)

        if track_cuda_memory:
            torch.cuda.memory._record_memory_history()
        _reset_peak_memory_stats()

        if task_type != "eval_leaderboard":
            create_fn = spec["create_fn"]
            model, tokenizer, quantized, model_file_size = create_fn(
                model_id, config[1], cfg, quant_fn is not None, quant_dir
            )
            if not quantized and quant_fn:
                # avoid interventions between models
                quant_config = copy.deepcopy(config[1])
                if algo == "mxq":
                    ok, metric_fp = get_mxq_quant_meta_data_file(model_id)
                    if not ok:
                        print(
                            f"Quantization meta data file: {metric_fp} doesn't exists!"
                        )
                        return
                    quant_config["quant_metrics_file"] = metric_fp
                    for key in _MXQ_OPTION_KEYS:
                        quant_config[key] = kwargs.get(key, None)
                model, duration, model_file_size = quant_fn(
                    model,
                    tokenizer,
                    quant_config,
                    model_id,
                    cfg,
                    quant_dir,
                )
                metric["quant_duration"] = duration

            if task_type == "eval_model_storage":
                allot, reserved = get_memory_metrics()
                metric["load_mem_allot"] = allot
                metric["load_mem_reserved"] = reserved
                metric["model_storage_size"] = model_file_size
            elif task_type == "eval_ppl":
                # Evaluate the quantized model
                metric = eval_ppls(model, tokenizer, metric)
                metric["ppl_mem_allot"], metric["ppl_mem_reserved"] = (
                    get_memory_metrics()
                )
            if track_cuda_memory:
                _dump_cuda_mem_snapshot(experiment_name, model_id, algo, result_dir)
            cleanup(model)
        else:
            metric = eval_llm_leaderboard(
                experiment_name,
                model_id,
                algo,
                cfg,
                quant_fn is not None,
                metric,
                quant_dir,
                result_dir,
            )
            metric["leaderboard_mem_allot"], metric["leaderboard_mem_reserved"] = (
                get_memory_metrics()
            )
            if track_cuda_memory:
                _dump_cuda_mem_snapshot(experiment_name, model_id, algo, result_dir)

        save_partial_metric(experiment_name, algo, model_id, cfg, metric, result_dir)
        # Mark this sub-task as done and persist right away so that a crash
        # in a later sub-task does not lose the progress made so far.
        df_all.loc[
            (df_all["model"] == model_id)
            & (df_all["cfg"] == cfg)
            & (df_all["algo"] == algo)
            & (df_all["task_type"] == task_type),
            ["status", "completion_time"],
        ] = 1, datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        persist_progress(df_all, progress_path)

    # combine metrics
    combine_metrics(experiment_name, result_dir)


def _init_metrics(model_id, algo, config):
    """Return a metric record for one sub-task with all measurements zeroed.

    Args:
        model_id: model id of the form ``"<org>/<name>"``; the part after the
            first ``/`` is stored as the model name.
        algo: algorithm name.
        config: ``(name, config_object)`` pair.
    """
    return {
        "model": model_id.split("/")[1],
        "algo": algo,
        "config": config[0],
        "config_detail": str(config[1]).replace("\n", ""),
        "quant_duration": 0,
        "model_storage_size": 0,
        "load_mem_allot": 0,
        "load_mem_reserved": 0,
        "ppl_mem_allot": 0,
        "ppl_mem_reserved": 0,
        "leaderboard_mem_allot": 0,
        "leaderboard_mem_reserved": 0,
        "ppl_wikitext": 0,
        "ppl_c4": 0,
        "duration_wikitext": 0,
        "duration_c4": 0,
        "duration_leaderboard": 0,
        "ifeval": 0,
        "bbh": 0,
        "mathlevel5": 0,
        "gpqa": 0,
        "musr": 0,
        "mmlupro": 0,
    }


def do_expermient_fdata(
    experiment_name,
    models,
    tasks,
    track_cuda_memory=False,
    **kwargs,
):
    """Run ``do_expermient`` with all directories under ``/fdata/llm/mxq``.

    Extra keyword arguments are forwarded unchanged to ``do_expermient``.
    """
    do_expermient(
        experiment_name,
        models,
        tasks,
        quant_dir="/fdata/llm/mxq/snapshots",
        result_dir="/fdata/llm/mxq/results",
        log_dir="/fdata/llm/mxq/logs",
        track_cuda_memory=track_cuda_memory,
        **kwargs,
    )


########################################################################
#  Quantization experiments
########################################################################


def experiment_quant_hqq():
    """Quantize all models with every HQQ config."""
    models = ALL_MODELS
    tasks = {
        "hqq": {
            "type": "quant",
            "configs": HQQ_CONFIGS,
        },
    }
    do_expermient_fdata(
        "quant_hqq",
        models,
        tasks,
    )


def experiment_quant_mxq():
    """Quantize all models with every MXQ config."""
    models = ALL_MODELS
    task_type = "quant"
    algo = "mxq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": MXQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}_mxq", models, tasks)


def experiment_quant_awq():
    """Quantize the second model of ``ALL_MODELS`` with every AutoAWQ config."""
    # models = [ALL_MODELS[0], ALL_MODELS[2]]
    models = [ALL_MODELS[1]]
    task_type = "quant"
    algo = "awq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": AUTOAWQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_quant_gptq():
    """Quantize all models with every GPTQ config."""
    models = ALL_MODELS
    task_type = "quant"
    algo = "gptq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": GPTQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_quantize_405B():
    """Quantize Llama 3.1 405B Instruct with the second HQQ config."""
    models = [
        "meta-llama/Meta-Llama-3.1-405B-Instruct",
    ]
    tasks = {
        "hqq": {
            "type": "quant",
            "configs": HQQ_CONFIGS[1:2],
        },
    }
    do_expermient(
        "quant_hqq_405B",
        models,
        tasks,
        quant_dir="/data/gqq-eval/snapshots/",
    )


########################################################################
#  Perplexity evaluation experiments
########################################################################


def experiment_ppl_eval_fp16():
    """Evaluate perplexity of the unquantized (fp16) models."""
    models = ALL_MODELS
    task_type = "eval_ppl"
    algo = "fp16"
    tasks = {
        algo: {
            "type": task_type,
            "configs": [
                ("base", {}),
            ],
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_ppl_eval_awq():
    """Evaluate perplexity of two models with the second AutoAWQ config."""
    models = [ALL_MODELS[0], ALL_MODELS[2]]
    task_type = "eval_ppl"
    algo = "awq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": AUTOAWQ_CONFIGS[1:2],
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_ppl_eval_gptq():
    """Evaluate perplexity of all models with every GPTQ config."""
    models = ALL_MODELS
    task_type = "eval_ppl"
    algo = "gptq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": GPTQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_ppl_eval_hqq():
    """Evaluate perplexity of all models with every HQQ config."""
    models = ALL_MODELS
    task_type = "eval_ppl"
    algo = "hqq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": HQQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


########################################################################
#  Open LLM Leaderboard evaluation experiments
########################################################################


def experiment_llm_leaderboard_fp16():
    """Run the leaderboard benchmarks on two unquantized (fp16) models."""
    models = [ALL_MODELS[0], ALL_MODELS[2]]
    task_type = "eval_leaderboard"
    algo = "fp16"
    tasks = {
        algo: {
            "type": task_type,
            "configs": [
                ("base", {}),
            ],
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_llm_leaderboard_autogptq():
    """Run the leaderboard benchmarks on all models with every GPTQ config."""
    models = ALL_MODELS
    task_type = "eval_leaderboard"
    algo = "gptq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": GPTQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_llm_leaderboard_hqq():
    """Run the leaderboard benchmarks on all models with every HQQ config."""
    models = ALL_MODELS
    task_type = "eval_leaderboard"
    algo = "hqq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": HQQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_llm_leaderboard_mxq():
    """Run the leaderboard benchmarks on all models with every MXQ config."""
    models = ALL_MODELS
    task_type = "eval_leaderboard"
    algo = "mxq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": MXQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


def experiment_llm_leaderboard_autoawq():
    """Run the leaderboard benchmarks on the first model with two AutoAWQ configs."""
    models = ALL_MODELS[0:1]
    task_type = "eval_leaderboard"
    algo = "awq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": AUTOAWQ_CONFIGS[0:2],
        },
    }
    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


########################################################################
#  Mixed Quant Eval experiments
########################################################################


def experiment_quant_ppl_eval_mxq_comprise():
    """Quantize, then evaluate perplexity, for a set of mixed-budget configs.

    The configs are run under the ``"hqq"`` algo key with ``mixed=True``.
    """
    models = ALL_MODELS
    nbits = [4.06, 4.10, 4.15, 4.19, 4.24, 4.28, 4.33]
    equiv_mxq_configs = [
        (
            f"mxq-{str(bits).replace('.', '_')}",
            HQQQuantConfig(mixed=True, budget=bits, quant_scale=True),
        )
        for bits in nbits
    ]
    quant_tasks = {
        "hqq": {
            "type": "quant",
            "configs": equiv_mxq_configs,
        },
    }
    ppl_tasks = {
        "hqq": {
            "type": "eval_ppl",
            "configs": equiv_mxq_configs,
        },
    }
    do_expermient_fdata("quant_mxq_compromise", models, quant_tasks)
    do_expermient_fdata("eval_mxq_compromise", models, ppl_tasks)


########################################################################
#  Misc experiments
########################################################################


def experiment_fp16_llama3_8B_OOM():
    """Evaluate fp16 perplexity of the last model with CUDA memory tracking on."""
    models = ALL_MODELS[-1:]
    task_type = "eval_ppl"
    algo = "fp16"
    tasks = {
        algo: {
            "type": task_type,
            "configs": [
                ("base", {}),
            ],
        },
    }
    do_expermient_fdata(
        f"{task_type}_llama3_8B_OOM_{algo}",
        models,
        tasks,
        track_cuda_memory=True,
    )


def experiment_fp16_vs_hqq_eval_gpu_mem():
    """Evaluate perplexity of the last model in fp16 and then with one HQQ config.

    Both runs share the same experiment name.
    """
    models = ALL_MODELS[-1:]
    task_type = "eval_ppl"
    algo = "fp16"
    tasks = {
        algo: {
            "type": task_type,
            "configs": [
                ("base", {}),
            ],
        },
    }
    do_expermient_fdata("experiment_fp16_vs_hqq_eval_gpu_mem", models, tasks)

    algo = "hqq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": HQQ_CONFIGS[1:2],
        },
    }
    do_expermient_fdata("experiment_fp16_vs_hqq_eval_gpu_mem", models, tasks)


def experiment_eval_model_storage():
    """Collect model storage metrics for all algos, repeated five times."""
    models = ALL_MODELS
    task_type = "eval_model_storage"
    tasks = {
        "fp16": {
            "type": task_type,
            "configs": [
                ("base", {}),
            ],
        },
        "mxq": {
            "type": task_type,
            "configs": MXQ_CONFIGS,
        },
        "hqq": {
            "type": task_type,
            "configs": HQQ_CONFIGS,
        },
        "awq": {
            "type": task_type,
            "configs": AUTOAWQ_CONFIGS,
        },
        "gptq": {"type": task_type, "configs": GPTQ_CONFIGS},
    }
    for i in range(5):
        do_expermient_fdata(f"eval_model_storge_{i}", models, tasks)


def experiment_eval_ppl_all():
    """Evaluate perplexity of all models for fp16, MXQ, HQQ, AWQ and GPTQ."""
    models = ALL_MODELS
    task_type = "eval_ppl"
    tasks = {
        "fp16": {
            "type": task_type,
            "configs": [
                ("base", {}),
            ],
        },
        "mxq": {
            "type": task_type,
            "configs": MXQ_CONFIGS,
        },
        "hqq": {
            "type": task_type,
            "configs": HQQ_CONFIGS,
        },
        "awq": {
            "type": task_type,
            "configs": AUTOAWQ_CONFIGS,
        },
        "gptq": {"type": task_type, "configs": GPTQ_CONFIGS},
    }
    do_expermient_fdata("experiment_eval_ppl_all", models, tasks)


def experiment_debug_quant_hqq():
    """Debug run: storage metrics for one model and one HQQ config."""
    models = [ALL_MODELS[1]]
    task_type = "eval_model_storage"
    algo = "hqq"
    tasks = {
        algo: {
            "type": task_type,
            "configs": HQQ_CONFIGS[1:2],
        },
    }
    do_expermient(
        f"debug_{task_type}_{algo}",
        models,
        tasks,
        quant_dir="/fdata/llm/mxq/snapshots-debug",
        result_dir="/fdata/llm/mxq/results",
    )


def main():
    """Configure logging and run the currently selected experiment."""
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # experiment_llm_leaderboard_autogptq()
    # experiment_llm_leaderboard_fp16()
    # experiment_llm_leaderboard_hqq()
    # experiment_llm_leaderboard_autoawq()
    # experiment_quant_hqq()
    # experiment_quant_mxq()
    # experiment_quant_awq()
    # experiment_quant_gptq()
    # experiment_ppl_eval_fp16()
    # experiment_ppl_eval_hqq()
    # experiment_ppl_eval_gptq()
    # experiment_ppl_eval_awq()
    # experiment_fp16_llama3_8B_OOM()
    # experiment_fp16_vs_hqq_eval_gpu_mem()
    # experiment_debug_quant_hqq()
    # experiment_eval_model_storage()
    # experiment_eval_ppl_all()
    experiment_debug_quant_hqq()


if __name__ == "__main__":
    # os.environ['HF_DATASETS_OFFLINE'] = '1'
    # os.cpu_count() may return None when the count cannot be determined.
    max_threads = str(min(8, os.cpu_count() or 1))
    # NOTE(review): these variables are set after torch/transformers have
    # already been imported above; confirm the libraries still honor them.
    os.environ["OMP_NUM_THREADS"] = max_threads
    os.environ["OPENBLAS_NUM_THREADS"] = max_threads
    os.environ["MKL_NUM_THREADS"] = max_threads
    os.environ["VECLIB_MAXIMUM_THREADS"] = max_threads
    os.environ["NUMEXPR_NUM_THREADS"] = max_threads
    os.environ["NUMEXPR_MAX_THREADS"] = max_threads
    os.environ["HF_HOME"] = "/data/hugginface/"
    main()