import copy
import logging
import os

# from adapter.awq import create_awq_model
# from adapter.awq import quantize_awq_model
from datetime import datetime
from pathlib import Path

import pandas as pd
import torch
from auto_gptq import BaseQuantizeConfig as GPTQQuantConfig
from hqq.core.quantize import BaseQuantizeConfig as HQQQuantConfig
from transformers import BitsAndBytesConfig

from lm_quant_toolkit.adapter.autoawq import (
    create_autoawq_model,
    quantize_autoawq_model,
)
from lm_quant_toolkit.adapter.autogptq import (
    create_autogptq_model,
    quantize_autogptq_model,
)
from lm_quant_toolkit.adapter.bnb import create_bnb_model, quantize_bnb_model
from lm_quant_toolkit.adapter.fp16 import create_fp16_model
from lm_quant_toolkit.adapter.hqq import create_hqq_model, quantize_hqq_model
from lm_quant_toolkit.adapter.mxq import create_mxq_model, quantize_mxq_model
from lm_quant_toolkit.eval.common import (
    HQQ_CONFIGS,
    _dump_cuda_mem_snapshot,
    _reset_peak_memory_stats,
    cleanup,
    combine_metrics,
    get_memory_metrics,
    get_mxq_quant_meta_data_file,
    persist_progress,
    save_partial_metric,
)
from lm_quant_toolkit.eval.leaderboard import eval_llm_leaderboard
from lm_quant_toolkit.eval.perplexity import eval_ppls

ALL_MODELS = [
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-13b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.1-8B",
]

MXQ_CONFIGS = [
    (
        f"{bits:.2f}".replace(".", "_"),
        HQQQuantConfig(mixed=True, budget=bits, quant_scale=True),
    )
    for bits in [5.00, 4.75, 4.50, 4.25, 4.01, 3.76, 3.50, 3.00, 2.75, 2.48]
]

BNB_CONFIGS = [
    (
        "b4g64",
        BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=True,
        ),
    ),
    (
        "b8g128",
        BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_4bit_compute_dtype="float16",
        ),
    ),
]


AUTOAWQ_CONFIGS = [
    ("b4g32", {"w_bit": 4, "q_group_size": 32, "zero_point": True, "version": "GEMM"}),
    ("b4g64", {"w_bit": 4, "q_group_size": 64, "zero_point": True, "version": "GEMM"}),
    (
        "b4g128",
        {"w_bit": 4, "q_group_size": 128, "zero_point": True, "version": "GEMM"},
    ),
    # 3-bit not supported by AutoAWQ right now
    # ("b3g64", {"w_bit": 3, "q_group_size": 64, "zero_point": True, 'version':'gemv_fast'}),
    # ("b3g128", {"w_bit": 3, "q_group_size": 128, "zero_point": True, 'version':'gemv_fast'}),
]

GPTQ_CONFIGS = [
    (
        "b8g32",
        GPTQQuantConfig(bits=8, group_size=32, damp_percent=0.01, desc_act=False),
    ),
    (
        "b8g64",
        GPTQQuantConfig(bits=8, group_size=64, damp_percent=0.01, desc_act=False),
    ),
    (
        "b8g128",
        GPTQQuantConfig(bits=8, group_size=128, damp_percent=0.01, desc_act=False),
    ),
    (
        "b4g32",
        GPTQQuantConfig(bits=4, group_size=32, damp_percent=0.01, desc_act=False),
    ),
    (
        "b4g64",
        GPTQQuantConfig(bits=4, group_size=64, damp_percent=0.01, desc_act=False),
    ),
    (
        "b4g128",
        GPTQQuantConfig(bits=4, group_size=128, damp_percent=0.01, desc_act=False),
    ),
    (
        "b3g32",
        GPTQQuantConfig(bits=3, group_size=32, damp_percent=0.01, desc_act=False),
    ),
    (
        "b3g64",
        GPTQQuantConfig(bits=3, group_size=64, damp_percent=0.01, desc_act=False),
    ),
    (
        "b3g128",
        GPTQQuantConfig(bits=3, group_size=128, damp_percent=0.01, desc_act=False),
    ),
]


def gen_experiment_items(models, tasks):
    dikts = []
    for algo, spec in tasks.items():
        configs = spec["configs"]
        for config in configs:
            for model_id in models:
                dikts.append(
                    {
                        "model": model_id,
                        "cfg": config[0],
                        "task_type": spec["type"],
                        "algo": algo,
                    }
                )
    return pd.DataFrame(dikts)


def _setup_fn(algo, spec):
    match algo:
        case "fp16":
            spec["create_fn"] = create_fp16_model
            spec["quantize_fn"] = None
        case "awq":
            spec["create_fn"] = create_autoawq_model
            spec["quantize_fn"] = quantize_autoawq_model
        case "gptq":
            spec["create_fn"] = create_autogptq_model
            spec["quantize_fn"] = quantize_autogptq_model
        case "hqq":
            spec["create_fn"] = create_hqq_model
            spec["quantize_fn"] = quantize_hqq_model
        case "bnb":
            spec["create_fn"] = create_bnb_model
            spec["quantize_fn"] = quantize_bnb_model
        case "mxq":
            spec["create_fn"] = create_mxq_model
            spec["quantize_fn"] = quantize_mxq_model
        case _:
            raise ValueError(f"Invalid algo: {algo}")


def do_expermient(
    experiment_name,
    models,
    tasks,
    quant_dir="snapshots",
    result_dir="results",
    log_dir="logs",
    track_cuda_memory=False,
    **kwargs,
):
    df_all = gen_experiment_items(models, tasks)
    progress_path = os.path.join(result_dir, experiment_name, "progress.csv")
    if Path(progress_path).exists():
        df_saved = pd.read_csv(progress_path)
        df_all = df_all.merge(
            df_saved, how="left", on=["model", "cfg", "task_type", "algo"]
        )
        # filter already processed repos, equivalent to SQL is null
        df_todo = df_all.query("status != status or status != 1")
    else:
        df_all["status"] = 0
        df_all["completion_time"] = ""
        df_todo = df_all
    print("*" * 72)
    print("Sub-task list:")
    print(df_all)
    cnt_todo, cnt_tot = len(df_todo), len(df_all)
    print(f"Todo:{cnt_todo}, Done: {cnt_tot - cnt_todo}, Total: {cnt_tot}")
    if cnt_todo == 0:
        print("Tasks completed!")
    print("*" * 72)
    if cnt_todo == 0:
        return

    df_todo = df_todo.sort_values(by=["model", "cfg"], ascending=False)
    for idx, row in df_todo.iterrows():
        model_id = row["model"]
        algo = row["algo"]
        task_type = row["task_type"]
        cfg = row["cfg"]
        spec = tasks[algo]
        _setup_fn(algo, spec)
        config = [c for c in spec["configs"] if c[0] == cfg][0]
        quant_fn = spec["quantize_fn"]
        metric = _init_metrics(model_id, algo, config)
        print("*" * 72)
        if task_type == "quant":
            print(f"Quantizing {algo} on {model_id} w/ config: {cfg}...")
        elif task_type == "eval_ppl":
            print(f"Evaluating {algo} PPL on {model_id} w/ config: {cfg}...")
        elif task_type == "eval_leaderboard":
            print(
                f"Evaluating {algo} LLM Leaderboard benchmarks on {model_id} w/ config: {cfg}..."
            )
        else:
            print(
                f"Evaluating {algo} model storage metrics on {model_id} w/ config: {cfg}..."
            )
        print("*" * 72)

        if track_cuda_memory:
            torch.cuda.memory._record_memory_history()
        _reset_peak_memory_stats()
        if task_type != "eval_leaderboard":
            create_fn = spec["create_fn"]
            model, tokenizer, quantized, model_file_size = create_fn(
                model_id, config[1], cfg, quant_fn is not None, quant_dir
            )

            if not quantized and quant_fn:
                # avoid interventions between models
                quant_config = copy.deepcopy(config[1])
                if algo == "mxq":
                    ok, metric_fp = get_mxq_quant_meta_data_file(model_id)
                    if not ok:
                        print(
                            f"Quantization meta data file: {metric_fp} doesn't exists!"
                        )
                        return
                    quant_config["quant_metrics_file"] = metric_fp
                    quant_config["weight_algo"] = kwargs.get("weight_algo", None)
                    quant_config["boost_layers"] = kwargs.get("boost_layers", None)
                    quant_config["decline_layers"] = kwargs.get("decline_layers", None)
                    quant_config["boost_stop"] = kwargs.get("boost_stop", None)
                    quant_config["decline_stop"] = kwargs.get("decline_stop", None)
                    quant_config["ablation"] = kwargs.get("ablation", None)
                    quant_config["top_m_layer"] = kwargs.get("top_m_layer", None)
                    quant_config["factor"] = kwargs.get("factor", None)
                model, duration, model_file_size = quant_fn(
                    model,
                    tokenizer,
                    quant_config,
                    model_id,
                    cfg,
                    quant_dir,
                )
                metric["quant_duration"] = duration
            if task_type == "eval_model_storage":
                allot, reserved = get_memory_metrics()
                metric["load_mem_allot"] = allot
                metric["load_mem_reserved"] = reserved
                metric["model_storage_size"] = model_file_size

            elif task_type == "eval_ppl":
                # Evaluate the quantized model
                metric = eval_ppls(model, tokenizer, metric)
                metric["ppl_mem_allot"], metric["ppl_mem_reserved"] = (
                    get_memory_metrics()
                )
            if track_cuda_memory:
                _dump_cuda_mem_snapshot(experiment_name, model_id, algo, result_dir)
            cleanup(model)
        else:
            metric = eval_llm_leaderboard(
                experiment_name,
                model_id,
                algo,
                cfg,
                quant_fn is not None,
                metric,
                quant_dir,
                result_dir,
            )
            metric["leaderboard_mem_allot"], metric["leaderboard_mem_reserved"] = (
                get_memory_metrics()
            )
            if track_cuda_memory:
                _dump_cuda_mem_snapshot(experiment_name, model_id, algo, result_dir)
        save_partial_metric(experiment_name, algo, model_id, cfg, metric, result_dir)
        df_all.loc[
            (df_all["model"] == model_id)
            & (df_all["cfg"] == cfg)
            & (df_all["algo"] == algo)
            & (df_all["task_type"] == task_type),
            ["status", "completion_time"],
        ] = 1, datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        persist_progress(df_all, progress_path)
    # combine metrics
    combine_metrics(experiment_name, result_dir)


def _init_metrics(model_id, algo, config):
    return {
        "model": model_id.split("/")[1],
        "algo": algo,
        "config": config[0],
        "config_detail": str(config[1]).replace("\n", ""),
        "quant_duration": 0,
        "model_storage_size": 0,
        "load_mem_allot": 0,
        "load_mem_reserved": 0,
        "ppl_mem_allot": 0,
        "ppl_mem_reserved": 0,
        "leaderboard_mem_allot": 0,
        "leaderboard_mem_reserved": 0,
        "quant_duration": 0,
        "ppl_wikitext": 0,
        "ppl_c4": 0,
        "duration_wikitext": 0,
        "duration_c4": 0,
        "duration_leaderboard": 0,
        "ifeval": 0,
        "bbh": 0,
        "mathlevel5": 0,
        "gpqa": 0,
        "musr": 0,
        "mmlupro": 0,
    }


def do_expermient_fdata(
    experiment_name,
    models,
    tasks,
    track_cuda_memory=False,
):
    do_expermient(
        experiment_name,
        models,
        tasks,
        quant_dir="/fdata/llm/mxq/snapshots",
        result_dir="/fdata/llm/mxq/results",
        log_dir="/fdata/llm/mxq/logs",
        track_cuda_memory=track_cuda_memory,
    )


########################################################################
#  Quantization experiments
########################################################################


def experiment_quant_hqq():
    models = ALL_MODELS
    tasks = {
        "hqq": {
            "type": "quant",
            "configs": HQQ_CONFIGS,
        },
    }
    do_expermient_fdata(
        "quant_hqq",
        models,
        tasks,
    )


def experiment_quant_mxq():
    models = ALL_MODELS
    type = "quant"
    algo = "mxq"
    tasks = {
        algo: {
            "type": type,
            "configs": MXQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{type}_{algo}_mxq", models, tasks)


def experiment_quant_awq():
    # models = [ALL_MODELS[0], ALL_MODELS[2]]
    models = [ALL_MODELS[1]]
    type = "quant"
    algo = "awq"
    tasks = {
        algo: {
            "type": type,
            "configs": AUTOAWQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_quant_gptq():
    models = ALL_MODELS
    type = "quant"
    algo = "gptq"
    tasks = {
        algo: {
            "type": type,
            "configs": GPTQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_quantize_405B():
    models = [
        "meta-llama/Meta-Llama-3.1-405B-Instruct",
    ]

    tasks = {
        "hqq": {
            "type": "quant",
            "configs": HQQ_CONFIGS[1:2],
        },
    }
    do_expermient(
        "quant_hqq_405B",
        models,
        tasks,
        quant_dir="/data/gqq-eval/snapshots/",
    )


########################################################################
#  Perplexity evaluation experiments
########################################################################


def experiment_ppl_eval_fp16():
    models = ALL_MODELS
    type = "eval_ppl"
    algo = "fp16"
    tasks = {
        algo: {
            "type": type,
            "configs": [
                ("base", {}),
            ],
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_ppl_eval_awq():
    models = [ALL_MODELS[0], ALL_MODELS[2]]
    type = "eval_ppl"
    algo = "awq"
    tasks = {
        algo: {
            "type": type,
            "configs": AUTOAWQ_CONFIGS[1:2],
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_ppl_eval_gptq():
    models = ALL_MODELS
    type = "eval_ppl"
    algo = "gptq"
    tasks = {
        algo: {
            "type": type,
            "configs": GPTQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_ppl_eval_hqq():
    models = ALL_MODELS
    type = "eval_ppl"
    algo = "hqq"
    tasks = {
        algo: {
            "type": type,
            "configs": HQQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


########################################################################
#  Open LLM Leaderboard evaluation experiments
########################################################################


def experiment_llm_leaderboard_fp16():
    models = [ALL_MODELS[0], ALL_MODELS[2]]
    type = "eval_leaderboard"
    algo = "fp16"
    tasks = {
        algo: {
            "type": type,
            "configs": [
                ("base", {}),
            ],
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_llm_leaderboard_autogptq():
    models = ALL_MODELS
    type = "eval_leaderboard"
    algo = "gptq"
    tasks = {
        algo: {
            "type": type,
            "configs": GPTQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_llm_leaderboard_hqq():
    models = ALL_MODELS
    type = "eval_leaderboard"
    algo = "hqq"
    tasks = {
        algo: {
            "type": type,
            "configs": HQQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_llm_leaderboard_mxq():
    models = ALL_MODELS
    type = "eval_leaderboard"
    algo = "mxq"
    tasks = {
        algo: {
            "type": type,
            "configs": MXQ_CONFIGS,
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


def experiment_llm_leaderboard_autoawq():
    models = ALL_MODELS[0:1]
    type = "eval_leaderboard"
    algo = "awq"
    tasks = {
        algo: {
            "type": type,
            "configs": AUTOAWQ_CONFIGS[0:2],
        },
    }
    do_expermient_fdata(f"{type}_{algo}", models, tasks)


########################################################################
#  Mixed Quant Eval experiments
########################################################################


def experiment_quant_ppl_eval_mxq_comprise():
    models = ALL_MODELS
    equiv_mxq_configs = []
    nbits = [4.06, 4.10, 4.15, 4.19, 4.24, 4.28, 4.33]
    for bits in nbits:
        cfg_name = f"mxq-{str(bits).replace('.', '_')}"
        equiv_mxq_configs.append(
            (cfg_name, HQQQuantConfig(mixed=True, budget=bits, quant_scale=True))
        )
    quant_tasks = {
        "hqq": {
            "type": "quant",
            "configs": equiv_mxq_configs,
        },
    }
    ppl_tasks = {
        "hqq": {
            "type": "eval_ppl",
            "configs": equiv_mxq_configs,
        },
    }
    do_expermient_fdata("quant_mxq_compromise", models, quant_tasks)
    do_expermient_fdata("eval_mxq_compromise", models, ppl_tasks)


########################################################################
#  Misc experiments
########################################################################


def experiment_fp16_llama3_8B_OOM():
    models = ALL_MODELS[-1:]
    type = "eval_ppl"
    algo = "fp16"
    tasks = {
        algo: {
            "type": type,
            "configs": [
                ("base", {}),
            ],
        },
    }
    do_expermient_fdata(
        f"{type}_llama3_8B_OOM_{algo}",
        models,
        tasks,
        track_cuda_memory=True,
    )


def experiment_fp16_vs_hqq_eval_gpu_mem():
    models = ALL_MODELS[-1:]
    type = "eval_ppl"
    algo = "fp16"
    tasks = {
        algo: {
            "type": type,
            "configs": [
                ("base", {}),
            ],
        },
    }
    do_expermient_fdata("experiment_fp16_vs_hqq_eval_gpu_mem", models, tasks)
    algo = "hqq"
    tasks = {
        algo: {
            "type": type,
            "configs": HQQ_CONFIGS[1:2],
        },
    }
    do_expermient_fdata("experiment_fp16_vs_hqq_eval_gpu_mem", models, tasks)


def experiment_eval_model_storage():
    models = ALL_MODELS
    type = "eval_model_storage"
    tasks = {
        "fp16": {
            "type": type,
            "configs": [
                ("base", {}),
            ],
        },
        "mxq": {
            "type": type,
            "configs": MXQ_CONFIGS,
        },
        "hqq": {
            "type": type,
            "configs": HQQ_CONFIGS,
        },
        "awq": {
            "type": type,
            "configs": AUTOAWQ_CONFIGS,
        },
        "gptq": {"type": type, "configs": GPTQ_CONFIGS},
    }
    for i in range(5):
        do_expermient_fdata(f"eval_model_storge_{i}", models, tasks)


def experiment_eval_ppl_all():
    models = ALL_MODELS
    type = "eval_ppl"
    tasks = {
        "fp16": {
            "type": type,
            "configs": [
                ("base", {}),
            ],
        },
        "mxq": {
            "type": type,
            "configs": MXQ_CONFIGS,
        },
        "hqq": {
            "type": type,
            "configs": HQQ_CONFIGS,
        },
        "awq": {
            "type": type,
            "configs": AUTOAWQ_CONFIGS,
        },
        "gptq": {"type": type, "configs": GPTQ_CONFIGS},
    }
    do_expermient_fdata("experiment_eval_ppl_all", models, tasks)


def experiment_debug_quant_hqq():
    models = [ALL_MODELS[1]]
    type = "eval_model_storage"
    algo = "hqq"
    tasks = {
        algo: {
            "type": type,
            "configs": HQQ_CONFIGS[1:2],
        },
    }

    do_expermient(
        f"debug_{type}_{algo}",
        models,
        tasks,
        quant_dir="/fdata/llm/mxq/snapshots-debug",
        result_dir="/fdata/llm/mxq/results",
    )


def main():
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # experiment_llm_leaderboard_autogptq()
    # experiment_llm_leaderboard_fp16()
    # experiment_llm_leaderboard_hqq()
    # experiment_llm_leaderboard_autoawq()
    # experiment_quant_hqq()
    # experiment_quant_mxq()
    # experiment_quant_awq()
    # experiment_quant_gptq()
    # experiment_ppl_eval_fp16()
    # experiment_ppl_eval_hqq()
    # experiment_ppl_eval_gptq()
    # experiment_ppl_eval_awq()
    # experiment_fp16_llama3_8B_OOM()
    # experiment_fp16_vs_hqq_eval_gpu_mem()
    # experiment_debug_quant_hqq()
    # experiment_eval_model_storage()
    # experiment_eval_ppl_all()
    experiment_debug_quant_hqq()


if __name__ == "__main__":
    # os.environ['HF_DATASETS_OFFLINE'] = '1'

    max_threads = str(min(8, os.cpu_count()))
    os.environ["OMP_NUM_THREADS"] = max_threads
    os.environ["OPENBLAS_NUM_THREADS"] = max_threads
    os.environ["MKL_NUM_THREADS"] = max_threads
    os.environ["VECLIB_MAXIMUM_THREADS"] = max_threads
    os.environ["NUMEXPR_NUM_THREADS"] = max_threads
    os.environ["NUMEXPR_MAX_THREADS"] = max_threads
    os.environ["HF_HOME"] = "/data/hugginface/"

    main()