| import os |
| import sys |
| import argparse |
| import json |
| from tqdm import tqdm |
| import asyncio |
|
|
| from models import get_model, VLLMClient |
| from benchmarks import get_dataset |
|
|
def setup_arg_parser(argv=None):
    """Parse command-line options for an evaluation run.

    Despite the name, this returns the *parsed* namespace, not the parser
    (name kept for backward compatibility with existing callers).

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, which
            makes argparse fall back to ``sys.argv[1:]`` — identical to the
            original zero-argument behavior, but now testable.

    Returns:
        argparse.Namespace with all evaluation options.
    """
    parser = argparse.ArgumentParser(description="Run evaluation on a given model and dataset.")

    # --- Model options ---
    parser.add_argument("--model_name", type=str, required=True, help="Registered name of the model type (e.g., 'Qwen-2.5-Omni-7B').")
    parser.add_argument("--model_path", type=str, default="", help="Path to the inference model.")
    parser.add_argument("--model_api_url", type=str, default="", help="API url for the model.")
    parser.add_argument("--system_prompt", type=str, default="", help="System prompt for the model.")
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size for model generation.")
    parser.add_argument("--save_batch_size", type=int, default=128, help="Batch size for saving results.")

    # --- Dataset options ---
    parser.add_argument("--dataset_name", type=str, required=True, help="Registered name of the dataset (e.g., 'UNO-Bench').")
    parser.add_argument("--subset_name", type=str, default="", help="Subset name of the dataset.")
    parser.add_argument("--dataset_local_dir", type=str, default="", help="Local path to the dataset.")
    parser.add_argument("--hf_cache_dir", type=str, default="~/.cache/huggingface/hub", help="Hugging Face cache directory.")

    # --- Output / scoring options ---
    parser.add_argument("--output_dir", type=str, default="./eval_results", help="Directory to save evaluation results.")
    parser.add_argument("--exp_marking", type=str, default="", help="Experiment marking.")
    parser.add_argument("--scorer_api_url", type=str, default="", help="The score model API url.")
    parser.add_argument("--scorer_model_path", type=str, default="", help="The scorer model path.")
    parser.add_argument("--mode", choices=["inference", "scoring"], default="inference")

    return parser.parse_args(argv)
|
|
def main():
    """Entry point: parse CLI args, prepare the dataset, then run either
    inference (generate model responses) or scoring (compute metrics)."""
    args = setup_arg_parser()
    print("Evaluation starting with the following configuration:")
    print(json.dumps(vars(args), indent=2))
    # exist_ok=True avoids the check-then-create race of the original
    # `os.path.exists(...) is False` test.
    os.makedirs(args.output_dir, exist_ok=True)
    # NOTE(review): ':' is not a legal filename character on Windows; kept
    # as-is so previously saved result files can still be resumed — confirm
    # before running on Windows.
    save_file_path = os.path.join(args.output_dir, f"{args.model_name}{args.exp_marking}:{args.dataset_name}.json")

    try:
        dataset_handler = get_dataset(args.dataset_name)
        dataset_kwargs = {}
        if args.dataset_local_dir:
            dataset_kwargs['local_dir'] = args.dataset_local_dir
        if args.hf_cache_dir:
            dataset_kwargs['hf_cache_dir'] = args.hf_cache_dir
        if args.subset_name:
            dataset_kwargs['subset_name'] = args.subset_name
        dataset_handler.load_and_prepare(**dataset_kwargs)
        # Resume from a previous partial run if results already exist.
        if os.path.exists(save_file_path):
            dataset_handler.load_results(save_file_path)
    except Exception as e:
        # Top-level boundary: report and bail out rather than crash.
        print(f"Error preparing dataset: {e}")
        return

    # Records still needing a response: never run, errored, or missing output.
    not_processed_records = [record for record in dataset_handler.evaluation_records
                             if record.request_status != 'success' or record.response is None]

    if args.mode == "inference" and not_processed_records:
        _run_inference(args, dataset_handler, not_processed_records, save_file_path)
    elif args.mode == "scoring":
        _run_scoring(args, dataset_handler, save_file_path)


def _run_inference(args, dataset_handler, not_processed_records, save_file_path):
    """Generate model responses for every unfinished record, checkpointing
    results to disk every ``args.save_batch_size`` records."""
    try:
        model_kwargs = {}
        if args.model_api_url != "":
            model_kwargs['api_url'] = args.model_api_url
        if args.system_prompt != "":
            model_kwargs['system_prompt'] = args.system_prompt
        model = get_model(args.model_name, args.model_path, **model_kwargs)
        model.load_model()
    except (ValueError, ImportError) as e:
        print(f"Error initializing model: {e}")
        return

    batch_size = args.batch_size
    progress_desc = f"Evaluating {args.model_name} on {args.dataset_name}"
    # Count records since the last checkpoint so the save cadence is
    # independent of batch size.
    records_since_save = 0

    if batch_size > 1:
        # Batched path: one async generate_batch call per slice of records.
        for batch_start in tqdm(range(0, len(not_processed_records), batch_size),
                                desc=progress_desc,
                                dynamic_ncols=True):
            batch_records = not_processed_records[batch_start:batch_start + batch_size]
            try:
                messages = [record.message for record in batch_records]
                responses = asyncio.run(model.generate_batch(messages))
                for record, response in zip(batch_records, responses):
                    record.response = response
                    # A None response means generation failed for this record.
                    record.request_status = 'error' if response is None else 'success'
            except Exception as e:
                # Best-effort: mark the whole batch failed and keep going;
                # failed records are retried on the next run.
                print(f"Error during batch generation: {e}")
                for record in batch_records:
                    record.response = str(e)
                    record.request_status = 'error'
            records_since_save += len(batch_records)
            # BUGFIX: the original tested `batch_idx % save_batch_size == 0`,
            # but batch_idx advances by batch_size, so the save rarely (or
            # never, past index 0) fired unless save_batch_size happened to
            # be a multiple of batch_size.
            if records_since_save >= args.save_batch_size:
                dataset_handler.save_results(save_file_path)
                records_since_save = 0
    else:
        # Sequential path: one synchronous generate call per record.
        # NOTE: no 'success'-skip here — records were already filtered above;
        # the original skip could strand a record whose status was 'success'
        # but whose response was None (it was selected yet never reprocessed).
        for record in tqdm(not_processed_records,
                           total=len(not_processed_records),
                           desc=progress_desc,
                           dynamic_ncols=True):
            try:
                record.response = model.generate(record.message)
                record.request_status = 'success'
            except Exception as e:
                print(f"Error during model generation for record {record.id}: {e}")
                record.response = str(e)
                record.request_status = 'error'
            records_since_save += 1
            if records_since_save >= args.save_batch_size:
                dataset_handler.save_results(save_file_path)
                records_since_save = 0

    # Final checkpoint so the tail of the run is never lost.
    dataset_handler.save_results(save_file_path)


def _run_scoring(args, dataset_handler, save_file_path):
    """Score previously generated responses, using either a remote vLLM API
    scorer or a local HuggingFace scorer, then persist the metrics."""
    if args.scorer_api_url != "":
        print("Loading scorer with vLLM API")
        score_client = get_model(
            model_name="VLLMClient",
            model_path="",
            api_url=args.scorer_api_url,
            system_prompt="You are a helpful assistant."
        )
    else:
        print("Loading scorer with HuggingFace")
        score_client = get_model(model_name="UNOScorerHF", model_path=args.scorer_model_path)

    score_client.load_model()
    dataset_handler.compute_metrics(score_client, save_file_path)
    dataset_handler.save_results(save_file_path)
|
|
# Script entry point: run the evaluation pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()