#!/usr/bin/env python3
"""Bee Evaluation Harness — measure before you optimize.
Runs lightweight benchmarks on any model checkpoint or base model.
Produces JSON reports for regression tracking and baseline comparisons.
Usage:
python -m bee.eval_harness --model HuggingFaceTB/SmolLM2-360M-Instruct --device mps
python -m bee.eval_harness --model ./autopilot_checkpoints/iter_100 --device cuda
Benchmarks:
- coding: 10 simple function implementation tasks
- reasoning: 10 math/logic puzzles
- instruct: 10 structured output compliance checks
- grounded: 5 fact-based QA with known answers
- domain: 5 domain-specific questions (programming, quantum, etc.)
"""
import argparse
import json
import logging
import re
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Callable, Dict, List
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
logger = logging.getLogger("bee.eval")
@dataclass
class EvalResult:
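    """Aggregate outcome of one benchmark run; JSON-serializable via dataclasses.asdict."""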
benchmark: str
score: float # 0.0 - 1.0
total: int
passed: int
latency_ms: float
details: List[dict]
def _generate(model, tokenizer, prompt: str, max_new_tokens: int = 128, temperature: float = 0.3) -> str:
"""Generate text from a prompt, returning decoded output.
Uses chat template for instruct models, falls back to raw prompt.
"""
if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
chat = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
else:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature)
    else:
        # Greedy decoding: omit temperature entirely so transformers does not
        # warn about a sampling flag that is never used.
        gen_kwargs["do_sample"] = False
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
gen = outputs[0][inputs["input_ids"].shape[1]:]
return tokenizer.decode(gen, skip_special_tokens=True).strip()
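# Illustrative call (names assume a pair returned by load_model below):
#   _generate(model, tokenizer, "What is 2+2?", max_new_tokens=8, temperature=0.0)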
# ── Benchmark: Coding ─────────────────────────────────────────────────────────
CODING_TASKS = [
{
"prompt": "Write a Python function that returns the factorial of n.",
"checks": [
lambda s: "def factorial" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function is_palindrome(s) that returns True if a string is a palindrome.",
"checks": [
lambda s: "def is_palindrome" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function fibonacci(n) that returns the nth Fibonacci number.",
"checks": [
lambda s: "def fibonacci" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function reverse_list(lst) that returns a reversed copy of a list.",
"checks": [
lambda s: "def reverse_list" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function sum_even_numbers(numbers) that sums only the even integers in a list.",
"checks": [
lambda s: "def sum_even_numbers" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function count_vowels(s) that counts the vowels in a string.",
"checks": [
lambda s: "def count_vowels" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function max_of_three(a, b, c) that returns the largest of three numbers.",
"checks": [
lambda s: "def max_of_three" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function merge_dicts(d1, d2) that merges two dictionaries.",
"checks": [
lambda s: "def merge_dicts" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function remove_duplicates(lst) that removes duplicates from a list while preserving order.",
"checks": [
lambda s: "def remove_duplicates" in s.lower(),
lambda s: "return" in s,
],
},
{
"prompt": "Write a Python function fahrenheit_to_celsius(f) that converts Fahrenheit to Celsius.",
"checks": [
lambda s: "def fahrenheit_to_celsius" in s.lower(),
lambda s: "return" in s,
],
},
]
def run_coding_benchmark(model, tokenizer) -> EvalResult:
"""Check if model produces syntactically valid function definitions."""
details = []
passed = 0
t0 = time.perf_counter()
for task in CODING_TASKS:
output = _generate(model, tokenizer, task["prompt"], max_new_tokens=128)
ok = all(check(output) for check in task["checks"])
passed += int(ok)
details.append({"prompt": task["prompt"], "output": output[:200], "pass": ok})
latency = (time.perf_counter() - t0) * 1000 / len(CODING_TASKS)
return EvalResult("coding", passed / len(CODING_TASKS), len(CODING_TASKS), passed, latency, details)
# ── Benchmark: Reasoning ────────────────────────────────────────────────────
REASONING_TASKS = [
{
"prompt": "What is 17 + 25? Answer with just the number.",
"answer": "42",
"match": lambda out, ans: ans in out,
},
{
"prompt": "If a train travels 60 km per hour, how far does it go in 2.5 hours? Answer with just the number.",
"answer": "150",
"match": lambda out, ans: ans in out,
},
{
"prompt": "What is the square root of 144? Answer with just the number.",
"answer": "12",
"match": lambda out, ans: ans in out,
},
{
"prompt": "A bat and a ball cost $11 total. The bat costs $10 more than the ball. How much does the ball cost? Answer with just the number.",
"answer": "0.5",
"match": lambda out, ans: any(a in out for a in ["0.5", "$0.5", "50 cents"]),
},
{
"prompt": "How many prime numbers are there between 1 and 10? Answer with just the number.",
"answer": "4",
"match": lambda out, ans: ans in out,
},
{
"prompt": "If it takes 5 machines 5 minutes to make 5 widgets, how long does it take 100 machines to make 100 widgets? Answer in minutes.",
"answer": "5",
"match": lambda out, ans: ans in out,
},
{
"prompt": "What is the capital of France? One word.",
"answer": "Paris",
"match": lambda out, ans: ans.lower() in out.lower(),
},
{
"prompt": "What is 2 to the power of 10? Answer with just the number.",
"answer": "1024",
"match": lambda out, ans: ans in out,
},
{
"prompt": "What is the next number in the sequence: 2, 4, 8, 16, ? Answer with just the number.",
"answer": "32",
"match": lambda out, ans: ans in out,
},
{
"prompt": "If today is Monday, what day will it be in 10 days? One word.",
"answer": "Thursday",
"match": lambda out, ans: ans.lower() in out.lower(),
},
]
def run_reasoning_benchmark(model, tokenizer) -> EvalResult:
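    """Score short-answer math/logic tasks, decoded greedily (temperature=0)."""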
details = []
passed = 0
t0 = time.perf_counter()
for task in REASONING_TASKS:
output = _generate(model, tokenizer, task["prompt"], max_new_tokens=20, temperature=0.0)
ok = task["match"](output, task["answer"])
passed += int(ok)
details.append({"prompt": task["prompt"], "output": output, "expected": task["answer"], "pass": ok})
latency = (time.perf_counter() - t0) * 1000 / len(REASONING_TASKS)
return EvalResult("reasoning", passed / len(REASONING_TASKS), len(REASONING_TASKS), passed, latency, details)
# ── Benchmark: Instruction Following ──────────────────────────────────────────
INSTRUCT_TASKS = [
{
"prompt": 'Answer the following in JSON format only: {"answer": "hello"}',
"check": lambda s: bool('{"answer": "hello"}' in s or '{"answer": "hello"}' in s.replace(" ", "")),
},
{
"prompt": "Summarize the following in exactly 3 bullet points:\n- Point A\n- Point B\n- Point C\n- Point D",
"check": lambda s: bool(s.count("\n-") == 3 or s.count("\n*") == 3 or s.count("\n") >= 3),
},
{
"prompt": "Translate 'Hello, how are you?' to French. Output only the translation.",
"check": lambda s: bool("bonjour" in s.lower() and "comment" in s.lower()),
},
{
"prompt": "List three colors. Format: 1. Color 1, 2. Color 2, 3. Color 3",
"check": lambda s: bool(re.search(r"1\.\s*\w", s) and re.search(r"3\.\s*\w", s)),
},
{
"prompt": "Write a haiku about the moon. It must have exactly 3 lines.",
"check": lambda s: bool(s.strip().count("\n") == 2),
},
{
"prompt": "Answer with exactly one word: What is the fastest land animal?",
"check": lambda s: bool(len(s.strip().split()) <= 2),
},
{
"prompt": "Capitalize every letter in the following: hello world",
"check": lambda s: bool("HELLO WORLD" in s),
},
{
"prompt": "Write the numbers 1 to 5 separated by commas only.",
"check": lambda s: bool("1,2,3,4,5" in s.replace(" ", "") or "1, 2, 3, 4, 5" in s),
},
{
"prompt": "Respond with 'CONFIRMED' in all caps and nothing else.",
"check": lambda s: bool("CONFIRMED" in s and len(s.strip().split()) <= 2),
},
{
"prompt": "Sort these words alphabetically: zebra, apple, mango. Output only the sorted list.",
"check": lambda s: bool("apple" in s and "mango" in s and "zebra" in s),
},
]
def run_instruct_benchmark(model, tokenizer) -> EvalResult:
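    """Score compliance with output-format instructions (JSON, casing, length)."""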
details = []
passed = 0
t0 = time.perf_counter()
for task in INSTRUCT_TASKS:
output = _generate(model, tokenizer, task["prompt"], max_new_tokens=64, temperature=0.0)
ok = task["check"](output)
passed += int(ok)
details.append({"prompt": task["prompt"], "output": output, "pass": ok})
latency = (time.perf_counter() - t0) * 1000 / len(INSTRUCT_TASKS)
return EvalResult("instruct", passed / len(INSTRUCT_TASKS), len(INSTRUCT_TASKS), passed, latency, details)
# ── Benchmark: Grounded / Hallucination ───────────────────────────────────────
GROUNDED_TASKS = [
{
"prompt": "What is the capital of Japan? One word.",
"answer": "Tokyo",
"check": lambda s: "tokyo" in s.lower(),
},
{
"prompt": "Who wrote 'Pride and Prejudice'? One name.",
"answer": "Jane Austen",
"check": lambda s: "austen" in s.lower(),
},
{
"prompt": "What is the chemical symbol for gold?",
"answer": "Au",
"check": lambda s: "au" in s.lower().split() or s.strip().upper() == "AU",
},
{
"prompt": "How many continents are there? Answer with just the number.",
"answer": "7",
"check": lambda s: "7" in s,
},
{
"prompt": "What is the speed of light in a vacuum, in meters per second? Use scientific notation: 3e8.",
"answer": "3e8",
"check": lambda s: "3e8" in s or "300000000" in s or "299792458" in s,
},
]
def run_grounded_benchmark(model, tokenizer) -> EvalResult:
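    """Score fact-recall questions with known answers (a hallucination proxy)."""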
details = []
passed = 0
t0 = time.perf_counter()
for task in GROUNDED_TASKS:
output = _generate(model, tokenizer, task["prompt"], max_new_tokens=20, temperature=0.0)
ok = task["check"](output)
passed += int(ok)
details.append({"prompt": task["prompt"], "output": output, "expected": task["answer"], "pass": ok})
latency = (time.perf_counter() - t0) * 1000 / len(GROUNDED_TASKS)
return EvalResult("grounded", passed / len(GROUNDED_TASKS), len(GROUNDED_TASKS), passed, latency, details)
# ── Benchmark: Domain (Programming / Quantum / Fintech) ─────────────────────
DOMAIN_TASKS = [
{
"prompt": "In Python, what function converts a string to an integer? One function name.",
"check": lambda s: bool("int(" in s or s.strip().lower() == "int"),
},
{
"prompt": "What is a qubit in one sentence?",
"check": lambda s: bool("quantum" in s.lower() and ("bit" in s.lower() or "state" in s.lower() or "superposition" in s.lower())),
},
{
"prompt": "What does 'blockchain' mean in one sentence?",
"check": lambda s: bool("ledger" in s.lower() or "decentralized" in s.lower() or "distributed" in s.lower()),
},
{
"prompt": "In cybersecurity, what does 'MITM' stand for? Give the full phrase.",
"check": lambda s: bool("man-in-the-middle" in s.lower() or "man in the middle" in s.lower()),
},
{
"prompt": "What is a 'smart contract' in one sentence?",
"check": lambda s: bool("self-executing" in s.lower() or "automatically" in s.lower() or "blockchain" in s.lower() or "code" in s.lower()),
},
]
def run_domain_benchmark(model, tokenizer) -> EvalResult:
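    """Score short domain questions (programming, quantum, fintech, security)."""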
details = []
passed = 0
t0 = time.perf_counter()
for task in DOMAIN_TASKS:
output = _generate(model, tokenizer, task["prompt"], max_new_tokens=64, temperature=0.0)
ok = task["check"](output)
passed += int(ok)
details.append({"prompt": task["prompt"], "output": output, "pass": ok})
latency = (time.perf_counter() - t0) * 1000 / len(DOMAIN_TASKS)
return EvalResult("domain", passed / len(DOMAIN_TASKS), len(DOMAIN_TASKS), passed, latency, details)
# ── Harness ─────────────────────────────────────────────────────────────────
BENCHMARKS: Dict[str, Callable] = {
"coding": run_coding_benchmark,
"reasoning": run_reasoning_benchmark,
"instruct": run_instruct_benchmark,
"grounded": run_grounded_benchmark,
"domain": run_domain_benchmark,
}
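# Minimal programmatic sketch (model ID is illustrative):
#   model, tokenizer = load_model("HuggingFaceTB/SmolLM2-360M-Instruct", "cpu")
#   result = BENCHMARKS["coding"](model, tokenizer)
#   print(f"coding: {result.passed}/{result.total} ({result.score:.0%})")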
def load_model(model_path: str, device: str):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
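    # Many causal-LM checkpoints ship without a pad token; reuse EOS so
    # generate() can pad without complaint.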
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
torch_dtype=torch.float16 if device == "mps" else None,
).to(device)
model.eval()
return model, tokenizer
def run_all_benchmarks(model, tokenizer, benchmarks: List[str] | None = None) -> List[EvalResult]:
"""Run benchmarks against an already-loaded (model, tokenizer) pair.
Differs from `run_all`, which takes a model path and loads/saves a JSON
report. This variant is for callers that already hold a live model in
memory — currently `bee.evolution._run_baseline_eval`, which evaluates
the running server's model without re-loading from disk.
"""
names = benchmarks or list(BENCHMARKS.keys())
out: List[EvalResult] = []
for name in names:
fn = BENCHMARKS.get(name)
if fn is None:
logger.warning("Unknown benchmark: %s", name)
continue
out.append(fn(model, tokenizer))
return out
def run_all(model_path: str, device: str, output_path: str | None = None, benchmarks: List[str] | None = None) -> Dict:
"""Run selected benchmarks and return/save results."""
benchmarks = benchmarks or list(BENCHMARKS.keys())
logger.info("Loading model: %s", model_path)
model, tokenizer = load_model(model_path, device)
n_params = sum(p.numel() for p in model.parameters()) / 1e6
logger.info("Model loaded: %.1fM params on %s", n_params, device)
results = {}
t_start = time.perf_counter()
for name in benchmarks:
if name not in BENCHMARKS:
logger.warning("Unknown benchmark: %s", name)
continue
logger.info("Running benchmark: %s", name)
result = BENCHMARKS[name](model, tokenizer)
results[name] = asdict(result)
logger.info(
" %s: %.0f%% (%d/%d) avg_latency=%.0fms",
name, result.score * 100, result.passed, result.total, result.latency_ms,
)
total_time = time.perf_counter() - t_start
report = {
"model": model_path,
"device": device,
"params_m": round(n_params, 1),
"total_time_s": round(total_time, 1),
"benchmarks": results,
"overall_score": round(sum(r["score"] for r in results.values()) / len(results), 3),
}
if output_path:
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w") as f:
json.dump(report, f, indent=2)
logger.info("Report saved: %s", output_path)
return report
def compare_reports(baseline_path: str, tuned_path: str):
"""Print side-by-side comparison of two evaluation reports."""
with open(baseline_path) as f:
baseline = json.load(f)
with open(tuned_path) as f:
tuned = json.load(f)
print(f"\n{'Benchmark':<12} {'Baseline':>10} {'Tuned':>10} {'Delta':>10} {'Status':>10}")
print("-" * 60)
for bench in baseline["benchmarks"]:
if bench not in tuned["benchmarks"]:
continue
b_score = baseline["benchmarks"][bench]["score"]
t_score = tuned["benchmarks"][bench]["score"]
delta = t_score - b_score
status = "PASS" if delta >= -0.05 else "REGRESS" if delta < 0 else "NEUTRAL"
print(f"{bench:<12} {b_score:>9.1%} {t_score:>9.1%} {delta:>+9.1%} {status:>10}")
print("-" * 60)
b_overall = baseline["overall_score"]
t_overall = tuned["overall_score"]
print(f"{'OVERALL':<12} {b_overall:>9.1%} {t_overall:>9.1%} {t_overall-b_overall:>+9.1%}")
print()
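# Typical comparison flow (file names are illustrative):
#   python -m bee.eval_harness --model ./base_model --output base.json
#   python -m bee.eval_harness --model ./autopilot_checkpoints/iter_100 --output tuned.json
#   python -m bee.eval_harness --compare base.json tuned.json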
def main():
parser = argparse.ArgumentParser(description="Bee Evaluation Harness")
parser.add_argument("--model", default="HuggingFaceTB/SmolLM2-360M-Instruct", help="Model path or HF ID")
parser.add_argument("--device", default="mps" if torch.backends.mps.is_available() else "cpu", help="Device")
parser.add_argument("--output", default="./data/eval_reports/report.json", help="Output JSON path")
parser.add_argument("--benchmarks", nargs="+", default=None, help="Benchmarks to run (default: all)")
parser.add_argument("--compare", nargs=2, metavar=("BASELINE", "TUNED"), help="Compare two reports")
args = parser.parse_args()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)
if args.compare:
compare_reports(args.compare[0], args.compare[1])
return
report = run_all(args.model, args.device, args.output, args.benchmarks)
print(f"\nOverall Score: {report['overall_score']:.1%}")
for name, r in report["benchmarks"].items():
print(f" {name:<12}: {r['score']:>6.1%} ({r['passed']}/{r['total']})")
if __name__ == "__main__":
main()