"""Generate answers with local models. | |
Usage: | |
python3 gen_model_answer.py --model-path lmsys/fastchat-t5-3b-v1.0 --model-id fastchat-t5-3b-v1.0 | |
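
Multi-GPU example (assumes two GPUs are available on the machine):
python3 gen_model_answer.py --model-path lmsys/fastchat-t5-3b-v1.0 --model-id fastchat-t5-3b-v1.0 --num-gpus-total 2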
""" | |
import argparse
import json
import os
import random
import time

import shortuuid
import torch
from tqdm import tqdm

from fastchat.llm_judge.common import load_questions, temperature_config
from fastchat.model import load_model, get_conversation_template
from fastchat.utils import str_to_torch_dtype


def run_eval(
    model_path,
    model_id,
    question_file,
    question_begin,
    question_end,
    answer_file,
    max_new_token,
    num_choices,
    num_gpus_per_model,
    num_gpus_total,
    max_gpu_memory,
    dtype,
    revision,
):
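    """Load the questions, then dispatch them to one or more model workers.

    When `num_gpus_total // num_gpus_per_model` is greater than one, the
    questions are split into chunks and answered in parallel via Ray.
    """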
    questions = load_questions(question_file, question_begin, question_end)
    # Randomly shuffle the questions to balance the load across workers.
    random.shuffle(questions)

    # Split the questions into chunks, one chunk per model worker.
    assert num_gpus_total % num_gpus_per_model == 0
    use_ray = num_gpus_total // num_gpus_per_model > 1

    if use_ray:
        get_answers_func = ray.remote(num_gpus=num_gpus_per_model)(
            get_model_answers
        ).remote
    else:
        get_answers_func = get_model_answers

    chunk_size = len(questions) // (num_gpus_total // num_gpus_per_model)
    ans_handles = []
    for i in range(0, len(questions), chunk_size):
        ans_handles.append(
            get_answers_func(
                model_path,
                model_id,
                questions[i : i + chunk_size],
                answer_file,
                max_new_token,
                num_choices,
                num_gpus_per_model,
                max_gpu_memory,
                dtype=dtype,
                revision=revision,
            )
        )

    if use_ray:
        ray.get(ans_handles)


def get_model_answers(
    model_path,
    model_id,
    questions,
    answer_file,
    max_new_token,
    num_choices,
    num_gpus_per_model,
    max_gpu_memory,
    dtype,
    revision,
):
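    """Generate answers for a chunk of questions with a single loaded model."""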
    model, tokenizer = load_model(
        model_path,
        revision=revision,
        device="cuda",
        num_gpus=num_gpus_per_model,
        max_gpu_memory=max_gpu_memory,
        dtype=dtype,
        load_8bit=False,
        cpu_offloading=False,
        debug=False,
    )

    for question in tqdm(questions):
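        # Use the category-specific sampling temperature when one is defined;
        # otherwise fall back to a default of 0.7.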
if question["category"] in temperature_config: | |
temperature = temperature_config[question["category"]] | |
else: | |
temperature = 0.7 | |
choices = [] | |
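        # Generate `num_choices` independent completions, each with a fixed
        # seed so that runs are reproducible.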
        for i in range(num_choices):
            torch.manual_seed(i)
            conv = get_conversation_template(model_id)
            turns = []

            for j in range(len(question["turns"])):
                qs = question["turns"][j]
                conv.append_message(conv.roles[0], qs)
                conv.append_message(conv.roles[1], None)
                prompt = conv.get_prompt()
                input_ids = tokenizer([prompt]).input_ids
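                # Use greedy decoding when the temperature is effectively zero.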
                if temperature < 1e-4:
                    do_sample = False
                else:
                    do_sample = True

                # some models may error out when generating long outputs
                try:
                    output_ids = model.generate(
                        torch.as_tensor(input_ids).cuda(),
                        do_sample=do_sample,
                        temperature=temperature,
                        max_new_tokens=max_new_token,
                    )
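                    # Encoder-decoder models return only the generated tokens;
                    # decoder-only models echo the prompt, so slice it off.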
                    if model.config.is_encoder_decoder:
                        output_ids = output_ids[0]
                    else:
                        output_ids = output_ids[0][len(input_ids[0]) :]

                    # be consistent with the template's stop_token_ids
                    if conv.stop_token_ids:
                        stop_token_ids_index = [
                            i
                            for i, id in enumerate(output_ids)
                            if id in conv.stop_token_ids
                        ]
                        if len(stop_token_ids_index) > 0:
                            output_ids = output_ids[: stop_token_ids_index[0]]

                    output = tokenizer.decode(
                        output_ids,
                        spaces_between_special_tokens=False,
                    )

                    if conv.stop_str and isinstance(conv.stop_str, list):
                        stop_str_indices = sorted(
                            [
                                output.find(stop_str)
                                for stop_str in conv.stop_str
                                if output.find(stop_str) > 0
                            ]
                        )
                        if len(stop_str_indices) > 0:
                            output = output[: stop_str_indices[0]]
                    elif conv.stop_str and output.find(conv.stop_str) > 0:
                        output = output[: output.find(conv.stop_str)]
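                    # Strip any remaining special tokens from the decoded text.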
                    for special_token in tokenizer.special_tokens_map.values():
                        if isinstance(special_token, list):
                            for special_tok in special_token:
                                output = output.replace(special_tok, "")
                        else:
                            output = output.replace(special_token, "")
                    output = output.strip()

                    if conv.name == "xgen" and output.startswith("Assistant:"):
                        output = output.replace("Assistant:", "", 1).strip()
                except RuntimeError as e:
                    print("ERROR question ID: ", question["question_id"])
                    output = "ERROR"
                conv.update_last_message(output)
                turns.append(output)

            choices.append({"index": i, "turns": turns})

        # Dump answers
        os.makedirs(os.path.dirname(answer_file), exist_ok=True)
        with open(os.path.expanduser(answer_file), "a") as fout:
            ans_json = {
                "question_id": question["question_id"],
                "answer_id": shortuuid.uuid(),
                "model_id": model_id,
                "choices": choices,
                "tstamp": time.time(),
            }
            fout.write(json.dumps(ans_json) + "\n")


def reorg_answer_file(answer_file):
    """Sort answers by question id and remove duplicates."""
    answers = {}
    with open(answer_file, "r") as fin:
        for l in fin:
            qid = json.loads(l)["question_id"]
            answers[qid] = l

    qids = sorted(list(answers.keys()))
    with open(answer_file, "w") as fout:
        for qid in qids:
            fout.write(answers[qid])
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument( | |
"--model-path", | |
type=str, | |
required=True, | |
help="The path to the weights. This can be a local folder or a Hugging Face repo ID.", | |
) | |
parser.add_argument( | |
"--model-id", type=str, required=True, help="A custom name for the model." | |
) | |
parser.add_argument( | |
"--bench-name", | |
type=str, | |
default="mt_bench", | |
help="The name of the benchmark question set.", | |
) | |
parser.add_argument( | |
"--question-begin", | |
type=int, | |
help="A debug option. The begin index of questions.", | |
) | |
parser.add_argument( | |
"--question-end", type=int, help="A debug option. The end index of questions." | |
) | |
parser.add_argument("--answer-file", type=str, help="The output answer file.") | |
parser.add_argument( | |
"--max-new-token", | |
type=int, | |
default=1024, | |
help="The maximum number of new generated tokens.", | |
) | |
parser.add_argument( | |
"--num-choices", | |
type=int, | |
default=1, | |
help="How many completion choices to generate.", | |
) | |
parser.add_argument( | |
"--num-gpus-per-model", | |
type=int, | |
default=1, | |
help="The number of GPUs per model.", | |
) | |
parser.add_argument( | |
"--num-gpus-total", type=int, default=1, help="The total number of GPUs." | |
) | |
parser.add_argument( | |
"--max-gpu-memory", | |
type=str, | |
help="Maxmum GPU memory used for model weights per GPU.", | |
) | |
parser.add_argument( | |
"--dtype", | |
type=str, | |
choices=["float32", "float16", "bfloat16"], | |
help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.", | |
default=None, | |
) | |
parser.add_argument( | |
"--revision", | |
type=str, | |
default="main", | |
help="The model revision to load.", | |
) | |
args = parser.parse_args() | |
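    # Ray is only needed when multiple model replicas run in parallel.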
    if args.num_gpus_total // args.num_gpus_per_model > 1:
        import ray

        ray.init()

    question_file = f"data/{args.bench_name}/question.jsonl"
    if args.answer_file:
        answer_file = args.answer_file
    else:
        answer_file = f"data/{args.bench_name}/model_answer/{args.model_id}.jsonl"

    print(f"Output to {answer_file}")

    run_eval(
        model_path=args.model_path,
        model_id=args.model_id,
        question_file=question_file,
        question_begin=args.question_begin,
        question_end=args.question_end,
        answer_file=answer_file,
        max_new_token=args.max_new_token,
        num_choices=args.num_choices,
        num_gpus_per_model=args.num_gpus_per_model,
        num_gpus_total=args.num_gpus_total,
        max_gpu_memory=args.max_gpu_memory,
        dtype=str_to_torch_dtype(args.dtype),
        revision=args.revision,
    )

    reorg_answer_file(answer_file)