Upload training_scripts/phase3_eval.py with huggingface_hub

289b193 verified about 1 month ago

2.96 kB

	#!/usr/bin/env python3
	"""Phase 3: Evaluation on HumanEval+ and MBPP+"""

	import subprocess
	from pathlib import Path

	# Config
	# Evaluation reads the merged model from CKPT_DIR / "merged" and writes its
	# logs into RESULTS_DIR, which is created up front if it does not exist yet.
	CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
	RESULTS_DIR = Path("./qwen3_pipeline/results")
	RESULTS_DIR.mkdir(parents=True, exist_ok=True)

	# Build the merged-model location once and reuse it for both the existence
	# check and the string handed to the evaluator.
	merged_dir = CKPT_DIR / "merged"
	model_path = str(merged_dir)

	print("="*70)
	print("PHASE 3: EVALUATION")
	print("="*70)
	print(f"\nModel: {model_path}\n")

	# Check if model exists
	if not merged_dir.exists():
	    print(f"❌ Model not found at {model_path}")
	    print(" Did Phase 2 complete successfully?")
	    # The bare `exit` name is installed by the `site` module as an
	    # interactive-shell convenience and is missing under `python -S` or in
	    # frozen builds; raising SystemExit always works and needs no import.
	    raise SystemExit(1)

	# Per-benchmark wall-clock limit, in seconds (30 minutes).
	EVAL_TIMEOUT_S = 1800

	# Substrings (matched case-insensitively) that mark a stdout line as a
	# score line worth echoing to the console.
	SCORE_KEYWORDS = ("pass@1", "base", "plus", "score")


	def _evalplus_command(dataset):
	    """Build the evalplus invocation for *dataset*.

	    Args:
	        dataset: Value passed to ``--dataset`` ("humaneval" or "mbpp").

	    Returns:
	        A ``(argv, display)`` pair: ``argv`` is the argument list used to
	        start the process, ``display`` is the same command rendered as a
	        single line (model path double-quoted) for the console.
	    """
	    argv = [
	        "evalplus.evaluate",
	        "--model", model_path,
	        "--dataset", dataset,
	        "--backend", "vllm",
	        "--greedy",
	        "--tp", "1",
	    ]
	    display = (
	        f'evalplus.evaluate '
	        f'--model "{model_path}" '
	        f'--dataset {dataset} '
	        f'--backend vllm '
	        f'--greedy '
	        f'--tp 1'
	    )
	    return argv, display


	def _as_text(data) -> str:
	    """Return captured process output as ``str``.

	    ``subprocess.TimeoutExpired`` can carry ``None`` (nothing captured) or
	    raw ``bytes`` even when the process was started in text mode, so both
	    are normalised here before being written to a log.
	    """
	    if data is None:
	        return ""
	    if isinstance(data, bytes):
	        return data.decode("utf-8", errors="replace")
	    return data


	def _write_log(log_path, stdout: str, stderr: str) -> None:
	    """Write *stdout* and *stderr* to *log_path*, separated by a marker."""
	    # Explicit UTF-8 so non-ASCII tool output cannot fail on a narrow
	    # locale encoding.
	    with open(log_path, "w", encoding="utf-8") as f:
	        f.write(stdout)
	        f.write("\n\n=== STDERR ===\n\n")
	        f.write(stderr)


	def _run_benchmark(label: str, argv, log_path, timeout: int = EVAL_TIMEOUT_S) -> bool:
	    """Run one benchmark command, save its output and echo the score lines.

	    Args:
	        label: Display name used in console messages (e.g. "HumanEval+").
	        argv: Argument list of the process to start.
	        log_path: File that receives the captured stdout and stderr.
	        timeout: Wall-clock limit in seconds.

	    Returns:
	        True only if the process finished within *timeout* with exit
	        status 0; False on timeout, non-zero exit or any other error.
	        Errors are reported on the console and never propagate, so one
	        failing benchmark does not prevent the next one from running.
	    """
	    try:
	        try:
	            # An argv list (no shell) makes the timeout kill hit the
	            # evaluator itself rather than an intermediate shell process.
	            result = subprocess.run(
	                argv,
	                capture_output=True,
	                text=True,
	                encoding="utf-8",
	                errors="replace",
	                timeout=timeout,
	            )
	        except subprocess.TimeoutExpired as exc:
	            print(f"❌ {label} timed out ({timeout // 60} min)")
	            # Keep whatever was produced before the kill; it is usually the
	            # only clue to where the run got stuck.
	            _write_log(log_path, _as_text(exc.stdout), _as_text(exc.stderr))
	            print(f"\nPartial log: {log_path}")
	            return False

	        # Save full log
	        _write_log(log_path, result.stdout, result.stderr)

	        # Print key results
	        print("\n" + "="*70)
	        print(f"{label.upper()} RESULTS")
	        print("="*70)

	        for line in result.stdout.split("\n"):
	            lowered = line.lower()
	            if any(k in lowered for k in SCORE_KEYWORDS):
	                print(line)

	        print(f"\nFull log: {log_path}")

	        # A non-zero exit means the evaluator itself reported failure, even
	        # if some score-looking lines were printed along the way.
	        if result.returncode != 0:
	            print(f"❌ {label} exited with status {result.returncode}")
	            return False
	        return True
	    except Exception as e:
	        # Deliberately broad: this is the per-benchmark boundary, and the
	        # remaining benchmark should still run after any failure here.
	        print(f"❌ {label} failed: {e}")
	        return False


	# HumanEval+
	print("="*70)
	print("HUMANEVAL+")
	print("="*70)

	humaneval_log = RESULTS_DIR / "humaneval_plus.log"
	humaneval_argv, cmd_humaneval = _evalplus_command("humaneval")

	print(f"\nRunning: {cmd_humaneval}\n")

	humaneval_ok = _run_benchmark("HumanEval+", humaneval_argv, humaneval_log)

	# MBPP+
	print("\n" + "="*70)
	print("MBPP+")
	print("="*70)

	mbpp_log = RESULTS_DIR / "mbpp_plus.log"
	mbpp_argv, cmd_mbpp = _evalplus_command("mbpp")

	print(f"\nRunning: {cmd_mbpp}\n")

	mbpp_ok = _run_benchmark("MBPP+", mbpp_argv, mbpp_log)

	# Summary
	# Only claim success when both benchmarks actually succeeded.
	# NOTE(review): the script still exits with status 0 after a failed
	# benchmark, as before; consider a non-zero exit if the next phase must not
	# run on incomplete results.
	failed_benchmarks = [
	    name
	    for name, ok in (("HumanEval+", humaneval_ok), ("MBPP+", mbpp_ok))
	    if not ok
	]

	print("\n" + "="*70)
	if failed_benchmarks:
	    print(f"❌ PHASE 3 FINISHED WITH FAILURES: {', '.join(failed_benchmarks)}")
	else:
	    print("✓ PHASE 3 COMPLETE")
	print("="*70)
	print(f"\nResults saved to: {RESULTS_DIR}/")
	print(f" - {humaneval_log}")
	print(f" - {mbpp_log}")
	print("\n➡️ Next: python phase4_codet.py")