import argparse
import os
from typing import Any, Dict, List

import pandas as pd
import torch

from evaluate import QwenEvaluator
|
|
def run_benchmark(model_id: str, dataset_path: str, num_samples: int = 10) -> None:
    """Benchmark a model on a JSONL instruction dataset and write a judged report.

    Loads up to *num_samples* records from *dataset_path* (JSON Lines with
    ``instruction`` / ``output`` fields), generates a response per instruction
    with the model wrapped by ``QwenEvaluator``, saves the raw results to a
    JSONL report, then attempts LLM-as-judge scoring via
    ``evaluator.judge_responses``.

    Args:
        model_id: Hugging Face model identifier (also used to name the report file).
        dataset_path: Path to the JSONL dataset.
        num_samples: Maximum number of rows to evaluate.

    Side effects:
        Writes ``benchmark_report_<model_id>.jsonl`` to the working directory
        and prints progress/diagnostics. Never raises: failures are caught,
        reported, and the function returns.
    """
    print(f"Benchmarking model: {model_id} on {dataset_path}")

    try:
        evaluator = QwenEvaluator(model_id=model_id)
        evaluator.setup_model()

        df = pd.read_json(dataset_path, orient="records", lines=True).head(num_samples)
        # The dataset may contain fewer rows than requested; report the real total.
        total = len(df)

        # Hoisted out of the loop: device availability does not change mid-run.
        cuda_ok = torch.cuda.is_available()

        results: List[Dict[str, Any]] = []
        # enumerate gives a reliable 1-based progress counter regardless of
        # the DataFrame's index values.
        for pos, (_, row) in enumerate(df.iterrows(), start=1):
            print(f"Evaluating sample {pos}/{total}")
            instruction = row.get("instruction", "")

            if not cuda_ok:
                # No GPU: emit a canned response so the pipeline (including
                # judging and report writing) can still be exercised end-to-end.
                print("CUDA not available. Simulating response...")
                response_clean = "<reasoning>\nSimulation of complex reasoning process...\n</reasoning>\n<answer>\nSimulation answer.\n</answer>"
            else:
                # ChatML-style prompt; assumes the tokenizer/model follow the
                # Qwen <|im_start|>/<|im_end|> convention.
                inputs = evaluator.tokenizer(
                    [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                    return_tensors="pt"
                ).to("cuda")

                outputs = evaluator.model.generate(**inputs, max_new_tokens=1024, use_cache=True)
                response = evaluator.tokenizer.batch_decode(outputs)[0]
                # Keep only the assistant turn and strip the end-of-turn marker.
                response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()

            results.append({
                "instruction": instruction,
                "ground_truth": row.get("output", ""),
                "model_response": response_clean
            })

        results_df = pd.DataFrame(results)

        # '/' in model ids would create subdirectories; flatten for the filename.
        report_path = f"benchmark_report_{model_id.replace('/', '_')}.jsonl"
        results_df.to_json(report_path, orient="records", lines=True)
        print(f"Raw benchmark results saved to {report_path}")

        try:
            judged_df = evaluator.judge_responses(results_df, "Complex reasoning and multi-step math/logic")

            # Overwrite the raw report with the judged version (superset of columns).
            judged_df.to_json(report_path, orient="records", lines=True)
            print(f"Judged benchmark report saved to {report_path}")

            avg_score = judged_df["judge_score"].mean() if "judge_score" in judged_df.columns else 0
            print(f"Average Judge Score: {avg_score:.2f}")
        except Exception as judge_e:
            # Judging is best-effort: the raw report above is already on disk.
            print(f"Judging failed: {judge_e}")
            print("Proceeding with raw results.")

    except Exception as e:
        # Top-level boundary for this script: report and return rather than crash.
        print(f"Benchmark failed: {e}")
        print("Note: 7B models require significant GPU memory. Ensure you are running this on a T4 x2 or A100 instance.")
|
|
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser(description="Benchmark a Qwen model on Reasoning Assistant") |
| parser.add_argument("--model", type=str, default="Qwen/Qwen3.5-7B", help="Model ID") |
| parser.add_argument("--dataset", type=str, default="reasoning_assistant_v2_10.jsonl", help="Dataset path") |
| parser.add_argument("--num", type=int, default=10, help="Number of samples") |
| |
| args = parser.parse_args() |
| |
| |
| import torch |
| |
| run_benchmark(args.model, args.dataset, args.num) |
|
|