"""Evaluate a base model and its GPTQ-quantized counterpart on the same language modeling task."""

from argparse import ArgumentParser

import datasets
import torch
from transformers import AutoTokenizer

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import LanguageModelingTask


DATASET = "/root/ld/ld_project/AutoGPTQ/examples/quantization/dataset/raw_datasets"
# Template for samples that provide an `input` field.
WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"
# Template for samples without one; <用户> ("user") and <AI> are MiniCPM's chat special tokens.
WITHOUT_INPUT_TEMPLATE = "<s><用户>{instruction}<AI>Output:"


def ds_refactor_fn(samples):
    """Rewrite batched instruction/input/output columns into prompt/output pairs."""
    instruction_data = samples["instruction"]
    input_data = samples["input"]
    output_data = samples["output"]

    new_samples = {"prompt": [], "output": []}
    for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
        if input_txt:
            prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
        else:
            prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
        new_samples["prompt"].append(prompt)
        new_samples["output"].append(output_txt)

    return new_samples


def main():
    parser = ArgumentParser()
    parser.add_argument("--base_model_dir", type=str, default="/root/ld/ld_model_pretrained/miniCPM-bf16")
    parser.add_argument(
        "--quantized_model_dir",
        type=str,
        default="/root/ld/ld_project/AutoGPTQ/examples/quantization/minicpm_2b_4bit",
    )
    parser.add_argument(
        "--num_samples",
        type=int,
        default=200,
        help="how many samples to draw for evaluation",
    )
    parser.add_argument("--sample_max_len", type=int, default=1024, help="max tokens for each sample")
    parser.add_argument("--block_max_len", type=int, default=2048, help="max tokens for each data block")
    parser.add_argument("--use_triton", action="store_true")
    args = parser.parse_args()

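    # Load the tokenizer and the full-precision base model. from_pretrained
    # requires a quantize config, but nothing is quantized at this stage;
    # the model is evaluated as-is to establish a baseline.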
    tokenizer = AutoTokenizer.from_pretrained(args.base_model_dir)

    model = AutoGPTQForCausalLM.from_pretrained(args.base_model_dir, BaseQuantizeConfig(), trust_remote_code=True)
    model.to("cuda:0")

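    # Build the evaluation task once; it is reused below for the quantized
    # model by swapping out `task.model`.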
    task = LanguageModelingTask(
        model=model,
        tokenizer=tokenizer,
        data_name_or_path=DATASET,
        prompt_col_name="prompt",
        label_col_name="output",
        num_samples=args.num_samples,
        sample_max_len=args.sample_max_len,
        block_max_len=args.block_max_len,
        load_fn=datasets.load_dataset,
        preprocess_fn=ds_refactor_fn,
        truncate_prompt=False,
    )

print(f"eval result for base model: {task.run()}") |
|
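    # Detach the base model from the task and release its GPU memory
    # before loading the quantized model.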
    task.model = None
    model.cpu()
    del model
    torch.cuda.empty_cache()

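    # Load the quantized model and rerun the identical task for a
    # like-for-like comparison with the baseline above.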
    model = AutoGPTQForCausalLM.from_quantized(
        args.quantized_model_dir, device="cuda:0", use_triton=args.use_triton, trust_remote_code=True
    )
    task.model = model
    task.device = model.device
    print(f"eval result for quantized model: {task.run()}")


if __name__ == "__main__":
    main()