"""Generate answers with local models. Usage: python3 gen_model_answer.py --model-path lmsys/fastchat-t5-3b-v1.0 --model-id fastchat-t5-3b-v1.0 """ import argparse import json import os import random import time import shortuuid import torch from tqdm import tqdm from fastchat.llm_judge.common import load_questions, temperature_config from fastchat.model import load_model, get_conversation_template from fastchat.utils import str_to_torch_dtype def run_eval( model_path, model_id, question_file, question_begin, question_end, answer_file, max_new_token, num_choices, num_gpus_per_model, num_gpus_total, max_gpu_memory, dtype, revision, ): questions = load_questions(question_file, question_begin, question_end) # random shuffle the questions to balance the loading random.shuffle(questions) # Split the question file into `num_gpus` files assert num_gpus_total % num_gpus_per_model == 0 use_ray = num_gpus_total // num_gpus_per_model > 1 if use_ray: get_answers_func = ray.remote(num_gpus=num_gpus_per_model)( get_model_answers ).remote else: get_answers_func = get_model_answers chunk_size = len(questions) // (num_gpus_total // num_gpus_per_model) ans_handles = [] for i in range(0, len(questions), chunk_size): ans_handles.append( get_answers_func( model_path, model_id, questions[i : i + chunk_size], answer_file, max_new_token, num_choices, num_gpus_per_model, max_gpu_memory, dtype=dtype, revision=revision, ) ) if use_ray: ray.get(ans_handles) @torch.inference_mode() def get_model_answers( model_path, model_id, questions, answer_file, max_new_token, num_choices, num_gpus_per_model, max_gpu_memory, dtype, revision, ): model, tokenizer = load_model( model_path, revision=revision, device="cuda", num_gpus=num_gpus_per_model, max_gpu_memory=max_gpu_memory, dtype=dtype, load_8bit=False, cpu_offloading=False, debug=False, ) for question in tqdm(questions): if question["category"] in temperature_config: temperature = temperature_config[question["category"]] else: temperature = 0.7 choices = [] for i in range(num_choices): torch.manual_seed(i) conv = get_conversation_template(model_id) turns = [] for j in range(len(question["turns"])): qs = question["turns"][j] conv.append_message(conv.roles[0], qs) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() input_ids = tokenizer([prompt]).input_ids if temperature < 1e-4: do_sample = False else: do_sample = True # some models may error out when generating long outputs try: output_ids = model.generate( torch.as_tensor(input_ids).cuda(), do_sample=do_sample, temperature=temperature, max_new_tokens=max_new_token, ) if model.config.is_encoder_decoder: output_ids = output_ids[0] else: output_ids = output_ids[0][len(input_ids[0]) :] # be consistent with the template's stop_token_ids if conv.stop_token_ids: stop_token_ids_index = [ i for i, id in enumerate(output_ids) if id in conv.stop_token_ids ] if len(stop_token_ids_index) > 0: output_ids = output_ids[: stop_token_ids_index[0]] output = tokenizer.decode( output_ids, spaces_between_special_tokens=False, ) if conv.stop_str and isinstance(conv.stop_str, list): stop_str_indices = sorted( [ output.find(stop_str) for stop_str in conv.stop_str if output.find(stop_str) > 0 ] ) if len(stop_str_indices) > 0: output = output[: stop_str_indices[0]] elif conv.stop_str and output.find(conv.stop_str) > 0: output = output[: output.find(conv.stop_str)] for special_token in tokenizer.special_tokens_map.values(): if isinstance(special_token, list): for special_tok in special_token: output = output.replace(special_tok, "") 
                        else:
                            output = output.replace(special_token, "")
                    output = output.strip()

                    if conv.name == "xgen" and output.startswith("Assistant:"):
                        output = output.replace("Assistant:", "", 1).strip()
                except RuntimeError as e:
                    print("ERROR question ID: ", question["question_id"])
                    output = "ERROR"

                conv.update_last_message(output)
                turns.append(output)

            choices.append({"index": i, "turns": turns})

        # Dump answers
        os.makedirs(os.path.dirname(answer_file), exist_ok=True)
        with open(os.path.expanduser(answer_file), "a") as fout:
            ans_json = {
                "question_id": question["question_id"],
                "answer_id": shortuuid.uuid(),
                "model_id": model_id,
                "choices": choices,
                "tstamp": time.time(),
            }
            fout.write(json.dumps(ans_json) + "\n")


def reorg_answer_file(answer_file):
    """Sort by question id and de-duplication"""
    answers = {}
    with open(answer_file, "r") as fin:
        for l in fin:
            qid = json.loads(l)["question_id"]
            answers[qid] = l

    qids = sorted(list(answers.keys()))
    with open(answer_file, "w") as fout:
        for qid in qids:
            fout.write(answers[qid])


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-path",
        type=str,
        required=True,
        help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
    )
    parser.add_argument(
        "--model-id", type=str, required=True, help="A custom name for the model."
    )
    parser.add_argument(
        "--bench-name",
        type=str,
        default="mt_bench",
        help="The name of the benchmark question set.",
    )
    parser.add_argument(
        "--question-begin",
        type=int,
        help="A debug option. The begin index of questions.",
    )
    parser.add_argument(
        "--question-end", type=int, help="A debug option. The end index of questions."
    )
    parser.add_argument("--answer-file", type=str, help="The output answer file.")
    parser.add_argument(
        "--max-new-token",
        type=int,
        default=1024,
        help="The maximum number of new generated tokens.",
    )
    parser.add_argument(
        "--num-choices",
        type=int,
        default=1,
        help="How many completion choices to generate.",
    )
    parser.add_argument(
        "--num-gpus-per-model",
        type=int,
        default=1,
        help="The number of GPUs per model.",
    )
    parser.add_argument(
        "--num-gpus-total", type=int, default=1, help="The total number of GPUs."
    )
    parser.add_argument(
        "--max-gpu-memory",
        type=str,
        help="Maximum GPU memory used for model weights per GPU.",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        choices=["float32", "float16", "bfloat16"],
        help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.",
        default=None,
    )
    parser.add_argument(
        "--revision",
        type=str,
        default="main",
        help="The model revision to load.",
    )

    args = parser.parse_args()

    if args.num_gpus_total // args.num_gpus_per_model > 1:
        import ray

        ray.init()

    question_file = f"data/{args.bench_name}/question.jsonl"
    if args.answer_file:
        answer_file = args.answer_file
    else:
        answer_file = f"data/{args.bench_name}/model_answer/{args.model_id}.jsonl"

    print(f"Output to {answer_file}")

    run_eval(
        model_path=args.model_path,
        model_id=args.model_id,
        question_file=question_file,
        question_begin=args.question_begin,
        question_end=args.question_end,
        answer_file=answer_file,
        max_new_token=args.max_new_token,
        num_choices=args.num_choices,
        num_gpus_per_model=args.num_gpus_per_model,
        num_gpus_total=args.num_gpus_total,
        max_gpu_memory=args.max_gpu_memory,
        dtype=str_to_torch_dtype(args.dtype),
        revision=args.revision,
    )

    reorg_answer_file(answer_file)
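
# A minimal usage sketch for the multi-GPU / Ray path above (the model path and
# GPU counts are illustrative, not prescriptive): with --num-gpus-per-model 2
# and --num-gpus-total 4, run_eval sees two model replicas, imports Ray in the
# __main__ block, splits the questions into chunks, and dispatches each chunk
# to its own Ray worker, which loads one model replica across two GPUs.
#
#   python3 gen_model_answer.py \
#       --model-path lmsys/vicuna-7b-v1.5 \
#       --model-id vicuna-7b-v1.5 \
#       --num-gpus-per-model 2 \
#       --num-gpus-total 4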