#!/usr/bin/env python3
"""
Batch evaluation: runs all 30 scenarios and prints a summary report.
Usage: python scripts/evaluate.py --url http://localhost:7860 --agent keyword --output results.json
"""
import argparse
import sys
import json
import time
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.baseline import KeywordAgent, LLMAgent, run_episode, save_results
# Task identifiers covered by the batch run, in the order they are executed.
TASKS = [
    "bug_detection",
    "security_audit",
    "architectural_review",
]
# Seeds 0 through 9; every task is run once per seed.
SEEDS = [*range(10)]
def run_batch_evaluation(url: str, agent, verbose: bool = False) -> list:
    """Run every task/seed scenario and return one record per scenario.

    Iterates over the module-level ``TASKS`` and ``SEEDS``, calls
    ``run_episode(url, task, seed, agent, verbose)`` for each pair and prints
    a one-line progress report per seed.

    Args:
        url: Passed through unchanged to ``run_episode``.
        agent: Passed through unchanged to ``run_episode``.
        verbose: Passed through unchanged to ``run_episode``.

    Returns:
        A list holding exactly one entry per (task, seed) pair: the dict
        returned by ``run_episode`` when the scenario was processed without
        error, otherwise a fallback dict with the keys ``task_id``, ``seed``,
        ``final_score`` (0.0) and ``error``.
    """
    all_results = []
    for task in TASKS:
        print(f"\n\u2500\u2500 Task: {task} \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500")
        for seed in SEEDS:
            try:
                result = run_episode(url, task, seed, agent, verbose)
                score = result["final_score"]
                # Clamp so an out-of-range score cannot distort the 10-cell bar.
                filled = max(0, min(10, int(score * 10)))
                bar = "\u2588" * filled + "\u2591" * (10 - filled)
                # The issue counts are display-only; a missing count must not
                # turn an otherwise valid result into a failure.
                found = result.get("issues_found", "?")
                total = result.get("issues_total", "?")
                print(f" Seed {seed:2d}: [{bar}] {score:.3f} ({found}/{total} issues)")
                # Record the result last: if anything above raised, only the
                # failure record below is stored, never both for one seed.
                all_results.append(result)
            except Exception as e:
                # Best-effort batch: one failing scenario must not abort the
                # remaining ones, so report it and keep going.
                print(f" Seed {seed:2d}: FAILED \u2014 {e}")
                all_results.append({"task_id": task, "seed": seed, "final_score": 0.0, "error": str(e)})
    return all_results
def print_summary(results: list):
    """Print a formatted summary report for a batch of scenario results.

    Records that contain an ``"error"`` key are treated as failed scenarios
    and are excluded from every statistic. For each task with at least one
    successful record the mean, median, standard deviation, best and worst
    score are printed, followed by an overall section.

    Args:
        results: List of result dicts. Every record without an ``"error"``
            key must provide ``"task_id"`` and ``"final_score"``.
    """
    from collections import defaultdict
    import statistics

    print("\n" + "="*60)
    print("EVALUATION SUMMARY")
    print("="*60)

    # Group the scores of successful scenarios by task.
    by_task = defaultdict(list)
    for r in results:
        if "error" not in r:
            by_task[r["task_id"]].append(r["final_score"])

    # Every list in by_task was created by an append, so none is empty.
    for task, scores in by_task.items():
        # stdev needs at least two samples; report 0 for a single score.
        spread = statistics.stdev(scores) if len(scores) > 1 else 0.0
        print(f"\n{task.upper().replace('_', ' ')}")
        print(f" Mean: {statistics.mean(scores):.3f}")
        print(f" Median: {statistics.median(scores):.3f}")
        print(f" Stdev: {spread:.3f}")
        print(f" Best: {max(scores):.3f}")
        print(f" Worst: {min(scores):.3f}")

    overall_scores = [s for scores in by_task.values() for s in scores]
    if overall_scores:
        passed = sum(1 for s in overall_scores if s > 0.5)
        # The denominator is the number of scenarios actually attempted, which
        # is fewer than 30 when only a subset of the tasks was run.
        print(f"\nOVERALL ({len(overall_scores)}/{len(results)} scenarios)")
        print(f" Mean score: {statistics.mean(overall_scores):.3f}")
        print(f" Success rate (>0.5): {passed/len(overall_scores)*100:.1f}%")
    print("="*60)
def main():
    """Parse the command line, run the batch evaluation and save the results.

    Builds the requested agent, checks that ``<url>/health`` answers, runs
    ``run_batch_evaluation``, prints the summary and writes the results with
    ``save_results``. Exits with status 1 when the LLM agent has no API key
    or when the health check fails.
    """
    # Declared up front because --task rebinds the module-level task list
    # that run_batch_evaluation iterates over.
    global TASKS

    parser = argparse.ArgumentParser(description="Batch evaluation of all 30 CodeLens scenarios")
    parser.add_argument("--url", default="http://localhost:7860")
    parser.add_argument("--agent", default="keyword", choices=["keyword", "llm"])
    parser.add_argument("--api-key", default="")
    parser.add_argument("--output", default="results.json", help="Output file (.json or .csv)")
    parser.add_argument("--verbose", action="store_true")
    # None is deliberately not listed in choices: it can never be typed on the
    # command line, and the None default is not validated against choices.
    parser.add_argument("--task", default=None,
                        choices=["bug_detection", "security_audit", "architectural_review"],
                        help="Run only a specific task (default: all)")
    args = parser.parse_args()

    if args.agent == "llm":
        import os
        # The command-line flag takes precedence over the environment variable.
        api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY", "")
        if not api_key:
            print("ERROR: LLM agent requires --api-key or ANTHROPIC_API_KEY env var")
            sys.exit(1)
        agent = LLMAgent(api_key)
    else:
        agent = KeywordAgent()

    # Check connectivity before starting the batch.
    try:
        import requests
        requests.get(f"{args.url}/health", timeout=5).raise_for_status()
    except Exception as e:
        print(f"ERROR: Cannot connect to {args.url}: {e}")
        sys.exit(1)

    if args.task:
        TASKS = [args.task]

    print(f"Running evaluation: {len(TASKS)} task(s), {len(SEEDS)} seeds each")
    print(f"Agent: {args.agent} | API: {args.url}")

    start = time.time()
    results = run_batch_evaluation(args.url, agent, args.verbose)
    print(f"\nCompleted in {time.time()-start:.1f}s")

    print_summary(results)

    if args.output:
        save_results(results, args.output)
        print(f"\nResults saved to: {args.output}")


if __name__ == "__main__":
    main()
|