import argparse
import os
from typing import Any, Dict, List

import pandas as pd
import torch

from evaluate import QwenEvaluator
|
|
def run_benchmark(model_id: str, dataset_path: str, num_samples: int = 10) -> None:
    """Benchmark a model on a JSONL instruction dataset and write a judged report.

    Loads up to *num_samples* records from *dataset_path* (JSON Lines with
    ``instruction`` / ``output`` fields), generates a response per instruction
    with the model wrapped by ``QwenEvaluator``, saves the raw results to a
    JSONL report, then attempts LLM-as-judge scoring via
    ``evaluator.judge_responses``.

    Args:
        model_id: Hugging Face model identifier (also used to name the report file).
        dataset_path: Path to the JSONL dataset.
        num_samples: Maximum number of rows to evaluate.

    Side effects:
        Writes ``benchmark_report_<model_id>.jsonl`` to the working directory
        and prints progress/diagnostics. Never raises: failures are caught,
        reported, and the function returns.
    """
    print(f"Benchmarking model: {model_id} on {dataset_path}")

    try:
        evaluator = QwenEvaluator(model_id=model_id)
        evaluator.setup_model()

        df = pd.read_json(dataset_path, orient="records", lines=True).head(num_samples)
        # The dataset may contain fewer rows than requested; report the real total.
        total = len(df)

        # Hoisted out of the loop: device availability does not change mid-run.
        cuda_ok = torch.cuda.is_available()

        results: List[Dict[str, Any]] = []
        # enumerate gives a reliable 1-based progress counter regardless of
        # the DataFrame's index values.
        for pos, (_, row) in enumerate(df.iterrows(), start=1):
            print(f"Evaluating sample {pos}/{total}")
            instruction = row.get("instruction", "")

            if not cuda_ok:
                # No GPU: emit a canned response so the pipeline (including
                # judging and report writing) can still be exercised end-to-end.
                print("CUDA not available. Simulating response...")
                response_clean = "<reasoning>\nSimulation of complex reasoning process...\n</reasoning>\n<answer>\nSimulation answer.\n</answer>"
            else:
                # ChatML-style prompt; assumes the tokenizer/model follow the
                # Qwen <|im_start|>/<|im_end|> convention.
                inputs = evaluator.tokenizer(
                    [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                    return_tensors="pt"
                ).to("cuda")

                outputs = evaluator.model.generate(**inputs, max_new_tokens=1024, use_cache=True)
                response = evaluator.tokenizer.batch_decode(outputs)[0]
                # Keep only the assistant turn and strip the end-of-turn marker.
                response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()

            results.append({
                "instruction": instruction,
                "ground_truth": row.get("output", ""),
                "model_response": response_clean
            })

        results_df = pd.DataFrame(results)

        # '/' in model ids would create subdirectories; flatten for the filename.
        report_path = f"benchmark_report_{model_id.replace('/', '_')}.jsonl"
        results_df.to_json(report_path, orient="records", lines=True)
        print(f"Raw benchmark results saved to {report_path}")

        try:
            judged_df = evaluator.judge_responses(results_df, "Complex reasoning and multi-step math/logic")

            # Overwrite the raw report with the judged version (superset of columns).
            judged_df.to_json(report_path, orient="records", lines=True)
            print(f"Judged benchmark report saved to {report_path}")

            avg_score = judged_df["judge_score"].mean() if "judge_score" in judged_df.columns else 0
            print(f"Average Judge Score: {avg_score:.2f}")
        except Exception as judge_e:
            # Judging is best-effort: the raw report above is already on disk.
            print(f"Judging failed: {judge_e}")
            print("Proceeding with raw results.")

    except Exception as e:
        # Top-level boundary for this script: report and return rather than crash.
        print(f"Benchmark failed: {e}")
        print("Note: 7B models require significant GPU memory. Ensure you are running this on a T4 x2 or A100 instance.")
|
|
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser(description="Benchmark a Qwen model on Reasoning Assistant") |
| parser.add_argument("--model", type=str, default="Qwen/Qwen3.5-7B", help="Model ID") |
| parser.add_argument("--dataset", type=str, default="reasoning_assistant_v2_10.jsonl", help="Dataset path") |
| parser.add_argument("--num", type=int, default=10, help="Number of samples") |
| |
| args = parser.parse_args() |
| |
| |
| import torch |
| |
| run_benchmark(args.model, args.dataset, args.num) |
|
|