"""Phase 3: Evaluation on HumanEval+ and MBPP+.

Runs the ``evalplus.evaluate`` command-line tool once per dataset against the
merged checkpoint under ``CKPT_DIR``, writes each run's stdout/stderr to a log
file under ``RESULTS_DIR``, and echoes the score-looking lines to the console.

The process exit status is 0 only when both evaluations ran to completion and
exited successfully; otherwise it is 1.
"""

import shlex
import subprocess
import sys
from pathlib import Path


CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
RESULTS_DIR = Path("./qwen3_pipeline/results")

model_path = str(CKPT_DIR / "merged")

humaneval_log = RESULTS_DIR / "humaneval_plus.log"
mbpp_log = RESULTS_DIR / "mbpp_plus.log"

# Wall-clock limit for a single evalplus run, in seconds (30 minutes).
EVAL_TIMEOUT_SECONDS = 1800

# A stdout line is echoed to the console when it contains any of these
# substrings (compared case-insensitively).
SCORE_KEYWORDS = ("pass@1", "base", "plus", "score")

BANNER = "=" * 70


def print_banner(title, leading_newline=True):
    """Print *title* framed by two banner rules.

    Args:
        title: Text shown between the rules.
        leading_newline: When true, the first rule is preceded by a blank line.
    """
    print(("\n" if leading_newline else "") + BANNER)
    print(title)
    print(BANNER)


def build_eval_command(model, dataset):
    """Return the ``evalplus.evaluate`` invocation as an argv list.

    A list (rather than one shell string) is used so the model path is passed
    as a single argument even if it contains spaces or shell metacharacters.

    Args:
        model: Path or name handed to ``--model``.
        dataset: Value handed to ``--dataset`` (``"humaneval"`` or ``"mbpp"``
            in this script).
    """
    return [
        "evalplus.evaluate",
        "--model", model,
        "--dataset", dataset,
        "--backend", "vllm",
        "--greedy",
        # NOTE(review): presumably the tensor-parallel size -- confirm
        # against the EvalPlus CLI documentation.
        "--tp", "1",
    ]


def format_command(command):
    """Return *command* (an argv list) as one shell-quoted display string."""
    return " ".join(shlex.quote(part) for part in command)


def score_lines(stdout):
    """Return the lines of *stdout* that mention any of ``SCORE_KEYWORDS``."""
    return [
        line
        for line in stdout.split("\n")
        if any(keyword in line.lower() for keyword in SCORE_KEYWORDS)
    ]


def _as_text(data):
    """Return captured output as ``str``.

    ``TimeoutExpired`` may carry ``None``, ``bytes`` or ``str`` for the
    partial output, so all three are normalised here.
    """
    if data is None:
        return ""
    if isinstance(data, bytes):
        return data.decode("utf-8", errors="replace")
    return data


def run_evaluation(label, command, log_path, timeout=EVAL_TIMEOUT_SECONDS):
    """Run one evaluation command, log its output, and print the scores.

    Args:
        label: Human-readable benchmark name used in console messages.
        command: Argv list to execute (no shell is involved).
        log_path: File that receives stdout followed by stderr.
        timeout: Maximum run time in seconds.

    Returns:
        True when the command finished within *timeout* with exit status 0
        and its log was written; False otherwise. Failures are reported on
        the console and never raised, so the caller can continue with the
        next benchmark.
    """
    print(f"\nRunning: {format_command(command)}\n")

    returncode = None  # stays None when the run is cut short by the timeout
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            errors="replace",  # undecodable bytes must not abort the run
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        # Keep whatever was produced before the kill so it can be inspected.
        stdout, stderr = _as_text(exc.stdout), _as_text(exc.stderr)
    except OSError as exc:
        # E.g. the executable is not installed or not on PATH.
        print(f"❌ {label} failed: {exc}")
        return False
    else:
        stdout, stderr = completed.stdout, completed.stderr
        returncode = completed.returncode

    try:
        with open(log_path, "w", encoding="utf-8") as log_file:
            log_file.write(stdout)
            log_file.write("\n\n=== STDERR ===\n\n")
            log_file.write(stderr)
    except OSError as exc:
        print(f"❌ {label} failed: {exc}")
        return False

    if returncode is None:
        print(f"❌ {label} timed out ({timeout / 60:g} min)")
        print(f"Partial log: {log_path}")
        return False

    print_banner(f"{label.upper()} RESULTS")
    for line in score_lines(stdout):
        print(line)
    print(f"\nFull log: {log_path}")

    if returncode != 0:
        print(f"❌ {label} failed: {command[0]} exited with status {returncode}")
        return False
    return True


def main():
    """Evaluate the merged model on both benchmarks.

    Returns:
        0 when both evaluations succeeded; 1 when the model directory is
        missing or at least one evaluation failed.
    """
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    print_banner("PHASE 3: EVALUATION", leading_newline=False)
    print(f"\nModel: {model_path}\n")

    if not (CKPT_DIR / "merged").exists():
        print(f"❌ Model not found at {model_path}")
        print(" Did Phase 2 complete successfully?")
        return 1

    print_banner("HUMANEVAL+", leading_newline=False)
    humaneval_ok = run_evaluation(
        "HumanEval+",
        build_eval_command(model_path, "humaneval"),
        humaneval_log,
    )

    print_banner("MBPP+")
    mbpp_ok = run_evaluation(
        "MBPP+",
        build_eval_command(model_path, "mbpp"),
        mbpp_log,
    )

    all_ok = humaneval_ok and mbpp_ok
    # Only claim success when both runs actually succeeded.
    print_banner("✓ PHASE 3 COMPLETE" if all_ok else "❌ PHASE 3 FINISHED WITH ERRORS")
    print(f"\nResults saved to: {RESULTS_DIR}/")
    print(f" - {humaneval_log}")
    print(f" - {mbpp_log}")

    if not all_ok:
        return 1

    print("\n➡️ Next: python phase4_codet.py")
    return 0


if __name__ == "__main__":
    sys.exit(main())