# openbmb/minicpm_2b_int4

from argparse import ArgumentParser

import datasets
import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import LanguageModelingTask
from transformers import AutoTokenizer
# Dataset location handed to the evaluation task as ``data_name_or_path``.
DATASET = "/root/ld/ld_project/AutoGPTQ/examples/quantization/dataset/raw_datasets"
# Prompt layout for samples whose "input" field is non-empty.
WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"
# Prompt layout for samples whose "input" field is empty.
# NOTE(review): this template wraps the instruction in "<s><用户>...<AI>" markers
# while WITH_INPUT_TEMPLATE uses a plain "Instruction:/Input:" layout -- confirm
# the asymmetry is intentional.
WITHOUT_INPUT_TEMPLATE = "<s><用户>{instruction}<AI>Output:"
def ds_refactor_fn(samples):
    """Convert a batch of instruction-style samples into prompt/output columns.

    Args:
        samples: Mapping with "instruction", "input" and "output" entries,
            each an iterable of per-sample values that are walked in lockstep.

    Returns:
        A dict with two equally long lists: "prompt" (the formatted prompt
        for each sample) and "output" (the corresponding output, unchanged).

    Raises:
        KeyError: If any of the three expected columns is missing.
    """
    prompts = []
    outputs = []
    # zip() stops at the shortest column, so all three are consumed together
    # even though only two of them feed the prompt.
    columns = zip(samples["instruction"], samples["input"], samples["output"])
    for instruction_txt, input_txt, output_txt in columns:
        # An empty/falsy "input" selects the template without an input section.
        prompt = (
            WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
            if input_txt
            else WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
        )
        prompts.append(prompt)
        outputs.append(output_txt)
    return {"prompt": prompts, "output": outputs}
def main():
    """Evaluate a base model and its GPTQ-quantized version on the same task.

    Parses the command-line options, runs a ``LanguageModelingTask`` with the
    base checkpoint, releases that model, then loads the quantized checkpoint
    into the same task object and runs it again. Both results are printed.
    """
    parser = ArgumentParser()
    parser.add_argument("--base_model_dir", type=str, default='/root/ld/ld_model_pretrained/miniCPM-bf16')
    parser.add_argument(
        "--quantized_model_dir",
        type=str,
        default='/root/ld/ld_project/AutoGPTQ/examples/quantization/minicpm_2b_4bit',
    )
    parser.add_argument(
        "--num_samples",
        type=int,
        default=200,
        help="how many samples will be sampled to evaluation",
    )
    parser.add_argument("--sample_max_len", type=int, default=1024, help="max tokens for each sample")
    parser.add_argument("--block_max_len", type=int, default=2048, help="max tokens for each data block")
    parser.add_argument("--use_triton", action="store_true")
    args = parser.parse_args()

    # Both models are placed on the same device.
    device = "cuda:0"

    tokenizer = AutoTokenizer.from_pretrained(args.base_model_dir)
    model = AutoGPTQForCausalLM.from_pretrained(
        args.base_model_dir,
        BaseQuantizeConfig(),
        trust_remote_code=True,
    )
    model.to(device)

    task = LanguageModelingTask(
        model=model,
        tokenizer=tokenizer,
        data_name_or_path=DATASET,
        prompt_col_name="prompt",
        label_col_name="output",
        num_samples=args.num_samples,  # how many samples will be sampled to evaluation
        sample_max_len=args.sample_max_len,  # max tokens for each sample
        block_max_len=args.block_max_len,  # max tokens for each data block
        load_fn=datasets.load_dataset,  # function to load dataset
        preprocess_fn=ds_refactor_fn,  # function to preprocess dataset
        truncate_prompt=False,  # truncate label when sample's length exceed sample_max_len
    )
    print(f"eval result for base model: {task.run()}")

    # Drop every reference to the base model and clear the CUDA cache before
    # the quantized model is loaded onto the same device.
    task.model = None
    model.cpu()
    del model
    torch.cuda.empty_cache()

    model = AutoGPTQForCausalLM.from_quantized(
        args.quantized_model_dir,
        device=device,
        use_triton=args.use_triton,
        trust_remote_code=True,
    )
    # Reuse the existing task object; only the model and its device change.
    task.model = model
    task.device = model.device
    print(f"eval result for quantized model: {task.run()}")
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()