Add files using upload-large-folder tool

64ddf8d verified 6 months ago

21.4 kB

	import copy
	import logging
	import os

	# from adapter.awq import create_awq_model
	# from adapter.awq import quantize_awq_model
	from datetime import datetime
	from pathlib import Path

	import pandas as pd
	import torch
	from auto_gptq import BaseQuantizeConfig as GPTQQuantConfig
	from hqq.core.quantize import BaseQuantizeConfig as HQQQuantConfig
	from transformers import BitsAndBytesConfig

	from lm_quant_toolkit.adapter.autoawq import (
	create_autoawq_model,
	quantize_autoawq_model,
	)
	from lm_quant_toolkit.adapter.autogptq import (
	create_autogptq_model,
	quantize_autogptq_model,
	)
	from lm_quant_toolkit.adapter.bnb import create_bnb_model, quantize_bnb_model
	from lm_quant_toolkit.adapter.fp16 import create_fp16_model
	from lm_quant_toolkit.adapter.hqq import create_hqq_model, quantize_hqq_model
	from lm_quant_toolkit.adapter.mxq import create_mxq_model, quantize_mxq_model
	from lm_quant_toolkit.eval.common import (
	HQQ_CONFIGS,
	_dump_cuda_mem_snapshot,
	_reset_peak_memory_stats,
	cleanup,
	combine_metrics,
	get_memory_metrics,
	get_mxq_quant_meta_data_file,
	persist_progress,
	save_partial_metric,
	)
	from lm_quant_toolkit.eval.leaderboard import eval_llm_leaderboard
	from lm_quant_toolkit.eval.perplexity import eval_ppls

	# Model identifiers (in "org/name" form) that the experiment functions
	# below select from, either as a whole or by index/slice.
	ALL_MODELS = [
	"meta-llama/Llama-2-7b-hf",
	"meta-llama/Llama-2-13b-hf",
	"meta-llama/Meta-Llama-3-8B",
	"meta-llama/Llama-3.1-8B",
	]

	# (name, config) pairs, one per bit budget. The name is the budget printed
	# with two decimals and "." replaced by "_" (e.g. 4.25 -> "4_25"); the
	# config is built with mixed=True and quant_scale=True.
	_MXQ_BIT_BUDGETS = (5.00, 4.75, 4.50, 4.25, 4.01, 3.76, 3.50, 3.00, 2.75, 2.48)

	MXQ_CONFIGS = [
	    (
	        f"{budget:.2f}".replace(".", "_"),
	        HQQQuantConfig(mixed=True, budget=budget, quant_scale=True),
	    )
	    for budget in _MXQ_BIT_BUDGETS
	]

	# bitsandbytes configurations, exposed as (name, config) pairs.
	_BNB_4BIT_NF4 = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_compute_dtype="float16",
	    bnb_4bit_use_double_quant=True,
	)
	_BNB_8BIT = BitsAndBytesConfig(
	    load_in_8bit=True,
	    bnb_4bit_compute_dtype="float16",
	)

	BNB_CONFIGS = [
	    ("b4g64", _BNB_4BIT_NF4),
	    ("b8g128", _BNB_8BIT),
	]


	# AutoAWQ (name, config) pairs: 4-bit GEMM with zero point, one entry per
	# group size. Each entry gets its own dict.
	# 3-bit not supported by AutoAWQ right now
	# ("b3g64", {"w_bit": 3, "q_group_size": 64, "zero_point": True, 'version':'gemv_fast'}),
	# ("b3g128", {"w_bit": 3, "q_group_size": 128, "zero_point": True, 'version':'gemv_fast'}),
	AUTOAWQ_CONFIGS = [
	    (
	        f"b4g{group_size}",
	        {
	            "w_bit": 4,
	            "q_group_size": group_size,
	            "zero_point": True,
	            "version": "GEMM",
	        },
	    )
	    for group_size in (32, 64, 128)
	]

	# GPTQ (name, config) pairs named "b<bits>g<group_size>". Bit widths vary
	# slowest (8, 4, 3) and group sizes fastest (32, 64, 128); every config
	# uses damp_percent=0.01 and desc_act=False.
	GPTQ_CONFIGS = [
	    (
	        f"b{bits}g{group_size}",
	        GPTQQuantConfig(
	            bits=bits,
	            group_size=group_size,
	            damp_percent=0.01,
	            desc_act=False,
	        ),
	    )
	    for bits in (8, 4, 3)
	    for group_size in (32, 64, 128)
	]


	def gen_experiment_items(models, tasks):
	    """Build the table of sub-tasks for an experiment.

	    One row is produced per (algo, config, model) combination, iterating
	    algorithms first, then configs, then models.

	    Args:
	        models: Model identifiers; iterated once per config.
	        tasks: Mapping of algo name to a spec dict with a "type" entry and
	            a "configs" sequence of (name, config) pairs. Only the config
	            name (element 0) is stored in the table.

	    Returns:
	        A DataFrame with the columns "model", "cfg", "task_type" and
	        "algo". The columns are present even when there are no rows, so
	        callers can merge on them unconditionally.
	    """
	    columns = ["model", "cfg", "task_type", "algo"]
	    rows = [
	        {
	            "model": model_id,
	            "cfg": config[0],
	            "task_type": spec["type"],
	            "algo": algo,
	        }
	        for algo, spec in tasks.items()
	        for config in spec["configs"]
	        for model_id in models
	    ]
	    # Pin the columns explicitly: an empty row list would otherwise yield a
	    # frame without any columns.
	    return pd.DataFrame(rows, columns=columns)


	def _setup_fn(algo, spec):
	match algo:
	case "fp16":
	spec["create_fn"] = create_fp16_model
	spec["quantize_fn"] = None
	case "awq":
	spec["create_fn"] = create_autoawq_model
	spec["quantize_fn"] = quantize_autoawq_model
	case "gptq":
	spec["create_fn"] = create_autogptq_model
	spec["quantize_fn"] = quantize_autogptq_model
	case "hqq":
	spec["create_fn"] = create_hqq_model
	spec["quantize_fn"] = quantize_hqq_model
	case "bnb":
	spec["create_fn"] = create_bnb_model
	spec["quantize_fn"] = quantize_bnb_model
	case "mxq":
	spec["create_fn"] = create_mxq_model
	spec["quantize_fn"] = quantize_mxq_model
	case _:
	raise ValueError(f"Invalid algo: {algo}")


	def do_expermient(
	    experiment_name,
	    models,
	    tasks,
	    quant_dir="snapshots",
	    result_dir="results",
	    log_dir="logs",
	    track_cuda_memory=False,
	    **kwargs,
	):
	    """Run every (algo, config, model) sub-task of an experiment, resumably.

	    The sub-task table is built from ``models`` and ``tasks``. If
	    ``<result_dir>/<experiment_name>/progress.csv`` exists, sub-tasks whose
	    saved ``status`` is 1 are skipped. After each sub-task its metrics are
	    saved and the progress table is persisted; once the loop finishes, the
	    per-task metrics are combined.

	    Args:
	        experiment_name: Name used for the progress file location and
	            passed to the metric/snapshot helpers.
	        models: Model identifiers to run.
	        tasks: Mapping of algo name to a spec dict with "type" and
	            "configs" ((name, config) pairs). Each spec is mutated:
	            ``_setup_fn`` adds "create_fn" and "quantize_fn" to it.
	        quant_dir: Directory handed to the create/quantize/leaderboard
	            functions.
	        result_dir: Directory holding progress and metric files.
	        log_dir: Accepted for interface compatibility; not used here.
	        track_cuda_memory: When true, CUDA memory history recording is
	            enabled and a memory snapshot is dumped after evaluation.
	        **kwargs: Optional MXQ settings copied into the quantization config
	            when ``algo == "mxq"``: weight_algo, boost_layers,
	            decline_layers, boost_stop, decline_stop, ablation,
	            top_m_layer, factor. Missing ones are stored as ``None``.

	    Returns:
	        None. Returns early (without combining metrics) when nothing is
	        left to do or when the MXQ quantization meta data file is missing.
	    """
	    banner = "*" * 72
	    mxq_option_names = (
	        "weight_algo",
	        "boost_layers",
	        "decline_layers",
	        "boost_stop",
	        "decline_stop",
	        "ablation",
	        "top_m_layer",
	        "factor",
	    )

	    df_all = gen_experiment_items(models, tasks)
	    progress_path = os.path.join(result_dir, experiment_name, "progress.csv")
	    if Path(progress_path).exists():
	        df_saved = pd.read_csv(progress_path)
	        df_all = df_all.merge(
	            df_saved, how="left", on=["model", "cfg", "task_type", "algo"]
	        )
	        # Filter out already processed items. "status != status" holds only
	        # for NaN, i.e. rows without a saved entry (the SQL "IS NULL" case).
	        df_todo = df_all.query("status != status or status != 1")
	    else:
	        df_all["status"] = 0
	        df_all["completion_time"] = ""
	        df_todo = df_all

	    print(banner)
	    print("Sub-task list:")
	    print(df_all)
	    cnt_todo, cnt_tot = len(df_todo), len(df_all)
	    print(f"Todo:{cnt_todo}, Done: {cnt_tot - cnt_todo}, Total: {cnt_tot}")
	    nothing_to_do = cnt_todo == 0
	    if nothing_to_do:
	        print("Tasks completed!")
	    print(banner)
	    if nothing_to_do:
	        return

	    df_todo = df_todo.sort_values(by=["model", "cfg"], ascending=False)
	    for _, row in df_todo.iterrows():
	        model_id = row["model"]
	        algo = row["algo"]
	        task_type = row["task_type"]
	        cfg = row["cfg"]
	        spec = tasks[algo]
	        _setup_fn(algo, spec)
	        config = [c for c in spec["configs"] if c[0] == cfg][0]
	        quant_fn = spec["quantize_fn"]
	        metric = _init_metrics(model_id, algo, config)

	        print(banner)
	        if task_type == "quant":
	            print(f"Quantizing {algo} on {model_id} w/ config: {cfg}...")
	        elif task_type == "eval_ppl":
	            print(f"Evaluating {algo} PPL on {model_id} w/ config: {cfg}...")
	        elif task_type == "eval_leaderboard":
	            print(
	                f"Evaluating {algo} LLM Leaderboard benchmarks on {model_id} w/ config: {cfg}..."
	            )
	        else:
	            print(
	                f"Evaluating {algo} model storage metrics on {model_id} w/ config: {cfg}..."
	            )
	        print(banner)

	        if track_cuda_memory:
	            torch.cuda.memory._record_memory_history()
	        # NOTE(review): the original nesting was lost in this copy. The peak
	        # reset is kept unconditional because memory metrics are recorded
	        # regardless of track_cuda_memory -- confirm against upstream.
	        _reset_peak_memory_stats()

	        if task_type != "eval_leaderboard":
	            create_fn = spec["create_fn"]
	            model, tokenizer, quantized, model_file_size = create_fn(
	                model_id, config[1], cfg, quant_fn is not None, quant_dir
	            )

	            if not quantized and quant_fn:
	                # Work on a copy to avoid interference between models that
	                # share the same config object.
	                quant_config = copy.deepcopy(config[1])
	                if algo == "mxq":
	                    ok, metric_fp = get_mxq_quant_meta_data_file(model_id)
	                    if not ok:
	                        print(
	                            f"Quantization meta data file: {metric_fp} doesn't exists!"
	                        )
	                        return
	                    quant_config["quant_metrics_file"] = metric_fp
	                    for option in mxq_option_names:
	                        quant_config[option] = kwargs.get(option, None)
	                model, duration, model_file_size = quant_fn(
	                    model,
	                    tokenizer,
	                    quant_config,
	                    model_id,
	                    cfg,
	                    quant_dir,
	                )
	                metric["quant_duration"] = duration

	            if task_type == "eval_model_storage":
	                allot, reserved = get_memory_metrics()
	                metric["load_mem_allot"] = allot
	                metric["load_mem_reserved"] = reserved
	                metric["model_storage_size"] = model_file_size
	            elif task_type == "eval_ppl":
	                # Evaluate the (possibly just quantized) model.
	                metric = eval_ppls(model, tokenizer, metric)
	                metric["ppl_mem_allot"], metric["ppl_mem_reserved"] = (
	                    get_memory_metrics()
	                )
	                if track_cuda_memory:
	                    _dump_cuda_mem_snapshot(
	                        experiment_name, model_id, algo, result_dir
	                    )
	            cleanup(model)
	        else:
	            # Leaderboard evaluation gets the model id and directories
	            # instead of a model object, so no model is created here.
	            metric = eval_llm_leaderboard(
	                experiment_name,
	                model_id,
	                algo,
	                cfg,
	                quant_fn is not None,
	                metric,
	                quant_dir,
	                result_dir,
	            )
	            metric["leaderboard_mem_allot"], metric["leaderboard_mem_reserved"] = (
	                get_memory_metrics()
	            )
	            if track_cuda_memory:
	                _dump_cuda_mem_snapshot(experiment_name, model_id, algo, result_dir)

	        save_partial_metric(experiment_name, algo, model_id, cfg, metric, result_dir)
	        # Mark this sub-task as done and persist right away so an
	        # interrupted run can resume from the progress file.
	        df_all.loc[
	            (df_all["model"] == model_id)
	            & (df_all["cfg"] == cfg)
	            & (df_all["algo"] == algo)
	            & (df_all["task_type"] == task_type),
	            ["status", "completion_time"],
	        ] = 1, datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	        persist_progress(df_all, progress_path)

	    # combine metrics
	    combine_metrics(experiment_name, result_dir)


	def _init_metrics(model_id, algo, config):
	return {
	"model": model_id.split("/")[1],
	"algo": algo,
	"config": config[0],
	"config_detail": str(config[1]).replace("\n", ""),
	"quant_duration": 0,
	"model_storage_size": 0,
	"load_mem_allot": 0,
	"load_mem_reserved": 0,
	"ppl_mem_allot": 0,
	"ppl_mem_reserved": 0,
	"leaderboard_mem_allot": 0,
	"leaderboard_mem_reserved": 0,
	"quant_duration": 0,
	"ppl_wikitext": 0,
	"ppl_c4": 0,
	"duration_wikitext": 0,
	"duration_c4": 0,
	"duration_leaderboard": 0,
	"ifeval": 0,
	"bbh": 0,
	"mathlevel5": 0,
	"gpqa": 0,
	"musr": 0,
	"mmlupro": 0,
	}


	def do_expermient_fdata(
	    experiment_name,
	    models,
	    tasks,
	    track_cuda_memory=False,
	    **kwargs,
	):
	    """Run ``do_expermient`` with the fixed ``/fdata/llm/mxq`` directories.

	    Args:
	        experiment_name: Forwarded unchanged.
	        models: Forwarded unchanged.
	        tasks: Forwarded unchanged.
	        track_cuda_memory: Forwarded unchanged.
	        **kwargs: Extra keyword options forwarded to ``do_expermient``
	            (which reads its MXQ settings from them). Must not contain
	            ``quant_dir``, ``result_dir`` or ``log_dir``, which are fixed
	            here.
	    """
	    do_expermient(
	        experiment_name,
	        models,
	        tasks,
	        quant_dir="/fdata/llm/mxq/snapshots",
	        result_dir="/fdata/llm/mxq/results",
	        log_dir="/fdata/llm/mxq/logs",
	        track_cuda_memory=track_cuda_memory,
	        **kwargs,
	    )


	########################################################################
	# Quantization experiments
	########################################################################


	def experiment_quant_hqq():
	    """Run the "quant" task for "hqq" over ALL_MODELS and HQQ_CONFIGS."""
	    hqq_spec = {
	        "type": "quant",
	        "configs": HQQ_CONFIGS,
	    }
	    do_expermient_fdata("quant_hqq", ALL_MODELS, {"hqq": hqq_spec})


	def experiment_quant_mxq():
	    """Run the "quant" task for "mxq" over ALL_MODELS and MXQ_CONFIGS.

	    The experiment is recorded under the name "quant_mxq_mxq".
	    """
	    models = ALL_MODELS
	    task_type = "quant"
	    algo = "mxq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": MXQ_CONFIGS,
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}_mxq", models, tasks)


	def experiment_quant_awq():
	    """Run the "quant" task for "awq" on ALL_MODELS[1] with AUTOAWQ_CONFIGS.

	    The experiment is recorded under the name "quant_awq".
	    """
	    # models = [ALL_MODELS[0], ALL_MODELS[2]]
	    models = [ALL_MODELS[1]]
	    task_type = "quant"
	    algo = "awq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": AUTOAWQ_CONFIGS,
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_quant_gptq():
	    """Run the "quant" task for "gptq" over ALL_MODELS and GPTQ_CONFIGS.

	    The experiment is recorded under the name "quant_gptq".
	    """
	    models = ALL_MODELS
	    task_type = "quant"
	    algo = "gptq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": GPTQ_CONFIGS,
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_quantize_405B():
	    """Run the "quant" task for "hqq" on the Llama 3.1 405B Instruct model.

	    Uses only the second HQQ config (HQQ_CONFIGS[1:2]) and calls
	    do_expermient directly with its own snapshot directory; the result
	    and log directories keep their defaults.
	    """
	    target_models = ["meta-llama/Meta-Llama-3.1-405B-Instruct"]
	    hqq_spec = {
	        "type": "quant",
	        "configs": HQQ_CONFIGS[1:2],
	    }
	    do_expermient(
	        "quant_hqq_405B",
	        target_models,
	        {"hqq": hqq_spec},
	        quant_dir="/data/gqq-eval/snapshots/",
	    )


	########################################################################
	# Perplexity evaluation experiments
	########################################################################


	def experiment_ppl_eval_fp16():
	    """Run the "eval_ppl" task for "fp16" over ALL_MODELS.

	    Uses the single config ("base", {}); recorded as "eval_ppl_fp16".
	    """
	    models = ALL_MODELS
	    task_type = "eval_ppl"
	    algo = "fp16"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": [
	                ("base", {}),
	            ],
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_ppl_eval_awq():
	    """Run the "eval_ppl" task for "awq" on ALL_MODELS[0] and ALL_MODELS[2].

	    Uses only the second AutoAWQ config (AUTOAWQ_CONFIGS[1:2]); recorded
	    as "eval_ppl_awq".
	    """
	    models = [ALL_MODELS[0], ALL_MODELS[2]]
	    task_type = "eval_ppl"
	    algo = "awq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": AUTOAWQ_CONFIGS[1:2],
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_ppl_eval_gptq():
	    """Run the "eval_ppl" task for "gptq" over ALL_MODELS and GPTQ_CONFIGS.

	    The experiment is recorded under the name "eval_ppl_gptq".
	    """
	    models = ALL_MODELS
	    task_type = "eval_ppl"
	    algo = "gptq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": GPTQ_CONFIGS,
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_ppl_eval_hqq():
	    """Run the "eval_ppl" task for "hqq" over ALL_MODELS and HQQ_CONFIGS.

	    The experiment is recorded under the name "eval_ppl_hqq".
	    """
	    models = ALL_MODELS
	    task_type = "eval_ppl"
	    algo = "hqq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": HQQ_CONFIGS,
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	########################################################################
	# Open LLM Leaderboard evaluation experiments
	########################################################################


	def experiment_llm_leaderboard_fp16():
	    """Run the "eval_leaderboard" task for "fp16" on ALL_MODELS[0] and [2].

	    Uses the single config ("base", {}); recorded as
	    "eval_leaderboard_fp16".
	    """
	    models = [ALL_MODELS[0], ALL_MODELS[2]]
	    task_type = "eval_leaderboard"
	    algo = "fp16"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": [
	                ("base", {}),
	            ],
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_llm_leaderboard_autogptq():
	    """Run the "eval_leaderboard" task for "gptq" over ALL_MODELS.

	    Uses every entry of GPTQ_CONFIGS; recorded as "eval_leaderboard_gptq".
	    """
	    models = ALL_MODELS
	    task_type = "eval_leaderboard"
	    algo = "gptq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": GPTQ_CONFIGS,
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_llm_leaderboard_hqq():
	    """Run the "eval_leaderboard" task for "hqq" over ALL_MODELS.

	    Uses every entry of HQQ_CONFIGS; recorded as "eval_leaderboard_hqq".
	    """
	    models = ALL_MODELS
	    task_type = "eval_leaderboard"
	    algo = "hqq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": HQQ_CONFIGS,
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_llm_leaderboard_mxq():
	    """Run the "eval_leaderboard" task for "mxq" over ALL_MODELS.

	    Uses every entry of MXQ_CONFIGS; recorded as "eval_leaderboard_mxq".
	    """
	    models = ALL_MODELS
	    task_type = "eval_leaderboard"
	    algo = "mxq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": MXQ_CONFIGS,
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	def experiment_llm_leaderboard_autoawq():
	    """Run the "eval_leaderboard" task for "awq" on the first model only.

	    Uses the first two AutoAWQ configs (AUTOAWQ_CONFIGS[0:2]); recorded as
	    "eval_leaderboard_awq".
	    """
	    models = ALL_MODELS[0:1]
	    task_type = "eval_leaderboard"
	    algo = "awq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": AUTOAWQ_CONFIGS[0:2],
	        },
	    }
	    do_expermient_fdata(f"{task_type}_{algo}", models, tasks)


	########################################################################
	# Mixed Quant Eval experiments
	########################################################################


	def experiment_quant_ppl_eval_mxq_comprise():
	    """Quantize, then evaluate perplexity, for a set of mixed bit budgets.

	    One (name, config) pair is built per budget; the name is "mxq-" plus
	    ``str(budget)`` with "." replaced by "_". Both runs register the
	    configs under the "hqq" algo: first a "quant" run
	    ("quant_mxq_compromise"), then an "eval_ppl" run
	    ("eval_mxq_compromise"), sharing the same config list.
	    """
	    bit_budgets = [4.06, 4.10, 4.15, 4.19, 4.24, 4.28, 4.33]
	    equiv_mxq_configs = [
	        (
	            f"mxq-{str(budget).replace('.', '_')}",
	            HQQQuantConfig(mixed=True, budget=budget, quant_scale=True),
	        )
	        for budget in bit_budgets
	    ]

	    quant_tasks = {"hqq": {"type": "quant", "configs": equiv_mxq_configs}}
	    ppl_tasks = {"hqq": {"type": "eval_ppl", "configs": equiv_mxq_configs}}

	    do_expermient_fdata("quant_mxq_compromise", ALL_MODELS, quant_tasks)
	    do_expermient_fdata("eval_mxq_compromise", ALL_MODELS, ppl_tasks)


	########################################################################
	# Misc experiments
	########################################################################


	def experiment_fp16_llama3_8B_OOM():
	    """Run the "eval_ppl" task for "fp16" on the last model, tracking memory.

	    Uses the single config ("base", {}) with CUDA memory tracking enabled;
	    recorded as "eval_ppl_llama3_8B_OOM_fp16".
	    """
	    models = ALL_MODELS[-1:]
	    task_type = "eval_ppl"
	    algo = "fp16"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": [
	                ("base", {}),
	            ],
	        },
	    }
	    do_expermient_fdata(
	        f"{task_type}_llama3_8B_OOM_{algo}",
	        models,
	        tasks,
	        track_cuda_memory=True,
	    )


	def experiment_fp16_vs_hqq_eval_gpu_mem():
	    """Run "eval_ppl" on the last model for "fp16", then for "hqq".

	    Both runs use the same experiment name,
	    "experiment_fp16_vs_hqq_eval_gpu_mem". The fp16 run uses the config
	    ("base", {}); the hqq run uses HQQ_CONFIGS[1:2].
	    """
	    models = ALL_MODELS[-1:]
	    task_type = "eval_ppl"

	    algo = "fp16"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": [
	                ("base", {}),
	            ],
	        },
	    }
	    do_expermient_fdata("experiment_fp16_vs_hqq_eval_gpu_mem", models, tasks)

	    algo = "hqq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": HQQ_CONFIGS[1:2],
	        },
	    }
	    do_expermient_fdata("experiment_fp16_vs_hqq_eval_gpu_mem", models, tasks)


	def experiment_eval_model_storage():
	    """Run the "eval_model_storage" task for five algos, five times over.

	    Covers fp16, mxq, hqq, awq and gptq on ALL_MODELS. Each of the five
	    repetitions is recorded under its own experiment name,
	    "eval_model_storge_<i>" for i in 0..4 (spelling kept as-is because it
	    determines where results are stored).
	    """
	    models = ALL_MODELS
	    task_type = "eval_model_storage"
	    tasks = {
	        "fp16": {
	            "type": task_type,
	            "configs": [
	                ("base", {}),
	            ],
	        },
	        "mxq": {
	            "type": task_type,
	            "configs": MXQ_CONFIGS,
	        },
	        "hqq": {
	            "type": task_type,
	            "configs": HQQ_CONFIGS,
	        },
	        "awq": {
	            "type": task_type,
	            "configs": AUTOAWQ_CONFIGS,
	        },
	        "gptq": {"type": task_type, "configs": GPTQ_CONFIGS},
	    }
	    for i in range(5):
	        do_expermient_fdata(f"eval_model_storge_{i}", models, tasks)


	def experiment_eval_ppl_all():
	    """Run the "eval_ppl" task for fp16, mxq, hqq, awq and gptq.

	    Covers ALL_MODELS with every config of each algo; recorded as
	    "experiment_eval_ppl_all".
	    """
	    models = ALL_MODELS
	    task_type = "eval_ppl"
	    tasks = {
	        "fp16": {
	            "type": task_type,
	            "configs": [
	                ("base", {}),
	            ],
	        },
	        "mxq": {
	            "type": task_type,
	            "configs": MXQ_CONFIGS,
	        },
	        "hqq": {
	            "type": task_type,
	            "configs": HQQ_CONFIGS,
	        },
	        "awq": {
	            "type": task_type,
	            "configs": AUTOAWQ_CONFIGS,
	        },
	        "gptq": {"type": task_type, "configs": GPTQ_CONFIGS},
	    }
	    do_expermient_fdata("experiment_eval_ppl_all", models, tasks)


	def experiment_debug_quant_hqq():
	    """Debug run: "eval_model_storage" for "hqq" on ALL_MODELS[1].

	    Uses HQQ_CONFIGS[1:2] and calls do_expermient directly with a separate
	    snapshot directory; recorded as "debug_eval_model_storage_hqq".
	    """
	    models = [ALL_MODELS[1]]
	    task_type = "eval_model_storage"
	    algo = "hqq"
	    tasks = {
	        algo: {
	            "type": task_type,
	            "configs": HQQ_CONFIGS[1:2],
	        },
	    }

	    do_expermient(
	        f"debug_{task_type}_{algo}",
	        models,
	        tasks,
	        quant_dir="/fdata/llm/mxq/snapshots-debug",
	        result_dir="/fdata/llm/mxq/results",
	    )


	def main():
	    """Configure INFO-level logging and run the currently selected experiment.

	    The commented-out calls list the other available experiments; enable
	    one by uncommenting it.
	    """
	    log_format = "%(asctime)s %(levelname)s [%(name)s] %(message)s"
	    logging.basicConfig(
	        level=logging.INFO,
	        format=log_format,
	        datefmt="%Y-%m-%d %H:%M:%S",
	    )

	    # experiment_llm_leaderboard_autogptq()
	    # experiment_llm_leaderboard_fp16()
	    # experiment_llm_leaderboard_hqq()
	    # experiment_llm_leaderboard_autoawq()
	    # experiment_quant_hqq()
	    # experiment_quant_mxq()
	    # experiment_quant_awq()
	    # experiment_quant_gptq()
	    # experiment_ppl_eval_fp16()
	    # experiment_ppl_eval_hqq()
	    # experiment_ppl_eval_gptq()
	    # experiment_ppl_eval_awq()
	    # experiment_fp16_llama3_8B_OOM()
	    # experiment_fp16_vs_hqq_eval_gpu_mem()
	    # experiment_debug_quant_hqq()
	    # experiment_eval_model_storage()
	    # experiment_eval_ppl_all()
	    experiment_debug_quant_hqq()


	if __name__ == "__main__":
	    # os.environ['HF_DATASETS_OFFLINE'] = '1'

	    # Cap the numeric libraries' thread pools at 8 threads.
	    # os.cpu_count() returns None when the count cannot be determined;
	    # fall back to a single thread instead of failing in min().
	    max_threads = str(min(8, os.cpu_count() or 1))
	    for thread_var in (
	        "OMP_NUM_THREADS",
	        "OPENBLAS_NUM_THREADS",
	        "MKL_NUM_THREADS",
	        "VECLIB_MAXIMUM_THREADS",
	        "NUMEXPR_NUM_THREADS",
	        "NUMEXPR_MAX_THREADS",
	    ):
	        os.environ[thread_var] = max_threads
	    os.environ["HF_HOME"] = "/data/hugginface/"

	    main()