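"""Evaluation driver for multimodal benchmarks such as UNO-Bench.

The script runs in two passes over a shared results file:

  * ``--mode inference`` generates responses for every record that does not
    yet have a successful answer, checkpointing periodically so interrupted
    runs can resume from the saved file.
  * ``--mode scoring`` loads a scorer (vLLM API or local HuggingFace model)
    and computes metrics over the saved responses.

Illustrative invocations (the filename ``run_eval.py`` and the scorer URL are
assumptions, not fixed by this script):

    python run_eval.py --model_name Qwen-2.5-Omni-7B --model_path /path/to/ckpt --dataset_name UNO-Bench --mode inference
    python run_eval.py --model_name Qwen-2.5-Omni-7B --dataset_name UNO-Bench --mode scoring --scorer_api_url http://localhost:8000/v1
"""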
import argparse
import asyncio
import json
import os

from tqdm import tqdm

from benchmarks import get_dataset
from models import get_model
def setup_arg_parser():
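    """Parse command-line arguments shared by the inference and scoring modes."""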
parser = argparse.ArgumentParser(description="Run evaluation on a given model and dataset.")
parser.add_argument("--model_name", type=str, required=True, help="Registered name of the model type (e.g., 'Qwen-2.5-Omni-7B').")
parser.add_argument("--model_path", type=str, default="", help="Path to the inference model.")
parser.add_argument("--model_api_url", type=str, default="", help="API url for the model.")
parser.add_argument("--system_prompt", type=str, default="", help="System prompt for the model.")
parser.add_argument("--batch_size", type=int, default=1, help="Batch size for model generation.")
parser.add_argument("--save_batch_size", type=int, default=128, help="Batch size for saving results.")
parser.add_argument("--dataset_name", type=str, required=True, help="Registered name of the dataset (e.g., 'UNO-Bench').")
parser.add_argument("--subset_name", type=str, default="", help="Subset name of the dataset.")
parser.add_argument("--dataset_local_dir", type=str, default="", help="Local path to the dataset.")
parser.add_argument("--hf_cache_dir", type=str, default="~/.cache/huggingface/hub", help="Hugging Face cache directory.")
parser.add_argument("--output_dir", type=str, default="./eval_results", help="Directory to save evaluation results.")
parser.add_argument("--exp_marking", type=str, default="", help="Experiment marking.")
parser.add_argument("--scorer_api_url", type=str, default="", help="The score model API url.")
parser.add_argument("--scorer_model_path", type=str, default="", help="The scorer model path.")
parser.add_argument("--mode", choices=["inference", "scoring"], default="inference")
return parser.parse_args()
def main():
args = setup_arg_parser()
print("Evaluation starting with the following configuration:")
print(json.dumps(vars(args), indent=2))
    os.makedirs(args.output_dir, exist_ok=True)
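    # One results file per (model, experiment marking, dataset). If it already
    # exists, previously saved responses are reloaded below so the run resumes
    # instead of starting over.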
save_file_path = os.path.join(args.output_dir, f"{args.model_name}{args.exp_marking}:{args.dataset_name}.json")
# 1. Initialize dataset and prepare evaluation records
try:
dataset_handler = get_dataset(args.dataset_name)
dataset_kwargs = {}
if args.dataset_local_dir:
dataset_kwargs['local_dir'] = args.dataset_local_dir
if args.hf_cache_dir:
dataset_kwargs['hf_cache_dir'] = args.hf_cache_dir
if args.subset_name:
dataset_kwargs['subset_name'] = args.subset_name
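        # Download/prepare the dataset and build the per-record evaluation
        # messages consumed by the model below.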
dataset_handler.load_and_prepare(**dataset_kwargs)
if os.path.exists(save_file_path):
dataset_handler.load_results(save_file_path)
except Exception as e:
print(f"Error preparing dataset: {e}")
return
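    # Only records without a successful response still need inference.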
not_processed_records = [record for record in dataset_handler.evaluation_records
if record.request_status != 'success' or record.response is None]
# 2. Load model and generate evaluation responses
if args.mode == "inference" and len(not_processed_records)>0:
try:
model_kwargs = {}
            if args.model_api_url:
                model_kwargs['api_url'] = args.model_api_url
            if args.system_prompt:
                model_kwargs['system_prompt'] = args.system_prompt
model = get_model(args.model_name, args.model_path, **model_kwargs)
model.load_model()
except (ValueError, ImportError) as e:
print(f"Error initializing model: {e}")
return
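        # Two generation paths: async batched requests when batch_size > 1,
        # otherwise one synchronous request per record.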
batch_size = args.batch_size
if batch_size > 1:
# Batch generation
for batch_idx in tqdm(range(0, len(not_processed_records), batch_size),
desc=f"Evaluating {args.model_name} on {args.dataset_name}",
dynamic_ncols=True):
batch_records = not_processed_records[batch_idx:batch_idx + batch_size]
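                # generate_batch appears to be an async coroutine; asyncio.run
                # drives the whole batch to completion in one event loop.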
try:
messages = [record.message for record in batch_records]
responses = asyncio.run(model.generate_batch(messages))
                    for record, response in zip(batch_records, responses):
                        record.response = response
                        record.request_status = 'error' if response is None else 'success'
except Exception as e:
print(f"Error during batch generation: {e}")
for record in batch_records:
record.response = str(e)
record.request_status = 'error'
                # Checkpoint roughly every save_batch_size records; batch_idx
                # advances by batch_size per iteration, so test the window rather
                # than an exact multiple that batch_idx may never land on.
                if batch_idx % args.save_batch_size < batch_size:
                    dataset_handler.save_results(save_file_path)
else:
# Sequential generation
for idx, record in tqdm(enumerate(not_processed_records), total=len(not_processed_records),
desc=f"Evaluating {args.model_name} on {args.dataset_name}",
dynamic_ncols=True):
                # not_processed_records was filtered above, so every record here
                # still needs a (re)try, including 'success' records whose
                # response is None.
try:
                    response = model.generate(record.message)
                    record.response = response
                    # Mirror the batch path: treat a None response as an error.
                    record.request_status = 'error' if response is None else 'success'
except Exception as e:
print(f"Error during model generation for record {record.id}: {e}")
record.response = str(e)
record.request_status = 'error'
if idx % args.save_batch_size == 0:
dataset_handler.save_results(save_file_path)
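        # Final flush so the last, possibly partial, window of results is persisted.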
dataset_handler.save_results(save_file_path)
# 3. Metric calculation
elif args.mode == "scoring":
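        # The scorer can run behind a vLLM-served API endpoint or locally via
        # the HuggingFace-backed UNOScorerHF wrapper.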
        if args.scorer_api_url:
print("Loading scorer with vLLM API")
score_client = get_model(
model_name="VLLMClient",
model_path="",
api_url=args.scorer_api_url,
system_prompt="You are a helpful assistant."
)
else:
print("Loading scorer with HuggingFace")
score_client = get_model(model_name="UNOScorerHF", model_path=args.scorer_model_path)
score_client.load_model()
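        # compute_metrics is expected to query the scorer over the saved
        # responses and attach metric values before the final save.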
dataset_handler.compute_metrics(score_client, save_file_path)
dataset_handler.save_results(save_file_path)
if __name__ == "__main__":
main()