# agentic-intent-classifier / evaluation / regression_suite.py
from __future__ import annotations

import json
import sys
from pathlib import Path

# Make the project root importable when this file is executed directly.
BASE_DIR = Path(__file__).resolve().parent.parent
if str(BASE_DIR) not in sys.path:
    sys.path.insert(0, str(BASE_DIR))

from combined_inference import classify_query
from schemas import validate_classify_response


def load_cases(path: Path) -> list[dict]:
    """Load a list of regression cases from a JSON file."""
    return json.loads(path.read_text(encoding="utf-8"))
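

# Illustrative case record (field names follow how evaluate_case_file reads them
# below; the concrete values are made up for documentation only):
#
#   {
#     "id": "case-001",
#     "status": "known_failure",
#     "text": "example query to classify",
#     "notes": "optional free-text context",
#     "expected": {"some.dotted.path": "expected value"}
#   }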


def write_json(path: Path, payload: dict | list) -> None:
    """Write a JSON artifact with stable key ordering, creating parent dirs as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def resolve_path(payload: dict, dotted_path: str):
    """Walk a dotted path through nested dicts; return None if any segment is missing."""
    value = payload
    for part in dotted_path.split("."):
        if isinstance(value, dict):
            value = value.get(part)
        else:
            return None
    return value
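

# For illustration, with a hypothetical payload (not the real classifier output):
#   resolve_path({"intent": {"primary": "travel"}}, "intent.primary")  -> "travel"
#   resolve_path({"intent": {"primary": "travel"}}, "intent.missing")  -> None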


def evaluate_case_file(cases_path: Path, output_dir: Path, artifact_name: str) -> dict:
    """Run every case through the classifier and write a pass/fail summary artifact."""
    cases = load_cases(cases_path)
    results = []
    counts_by_status: dict[str, dict[str, int]] = {}
    for case in cases:
        # Classify the case text and validate the response against the shared schema.
        payload = validate_classify_response(classify_query(case["text"]))
        mismatches = []
        expected = case.get("expected", {})
        actual_snapshot = {}
        for dotted_path, expected_value in expected.items():
            actual_value = resolve_path(payload, dotted_path)
            actual_snapshot[dotted_path] = actual_value
            if actual_value != expected_value:
                mismatches.append(
                    {
                        "path": dotted_path,
                        "expected": expected_value,
                        "actual": actual_value,
                    }
                )
        # Track pass/fail counts per case status bucket.
        status = case["status"]
        bucket = counts_by_status.setdefault(status, {"total": 0, "passed": 0, "failed": 0})
        bucket["total"] += 1
        if mismatches:
            bucket["failed"] += 1
        else:
            bucket["passed"] += 1
        results.append(
            {
                "id": case["id"],
                "status": status,
                "text": case["text"],
                "notes": case.get("notes", ""),
                "pass": not mismatches,
                "mismatches": mismatches,
                "expected": expected,
                "actual": actual_snapshot,
            }
        )
    summary = {
        "cases_path": str(cases_path),
        "count": len(results),
        "passed": sum(1 for item in results if item["pass"]),
        "failed": sum(1 for item in results if not item["pass"]),
        "by_status": counts_by_status,
        "results": results,
    }
    write_json(output_dir / artifact_name, summary)
    return summary


# Thin wrappers that fix the artifact file name for each regression / eval suite.

def evaluate_known_failure_cases(cases_path: Path, output_dir: Path) -> dict:
    return evaluate_case_file(cases_path, output_dir, "known_failure_regression.json")


def evaluate_iab_behavior_lock_cases(cases_path: Path, output_dir: Path) -> dict:
    return evaluate_case_file(cases_path, output_dir, "iab_behavior_lock_regression.json")


def evaluate_iab_cross_vertical_behavior_lock_cases(cases_path: Path, output_dir: Path) -> dict:
    return evaluate_case_file(cases_path, output_dir, "iab_cross_vertical_behavior_lock_regression.json")


def evaluate_iab_quality_target_cases(cases_path: Path, output_dir: Path) -> dict:
    return evaluate_case_file(cases_path, output_dir, "iab_quality_target_eval.json")


def evaluate_iab_cross_vertical_quality_target_cases(cases_path: Path, output_dir: Path) -> dict:
    return evaluate_case_file(cases_path, output_dir, "iab_cross_vertical_quality_target_eval.json")
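

if __name__ == "__main__":
    # Minimal sketch of running one suite by hand. The case-file path and output
    # directory below are assumptions about the repo layout, not paths defined here.
    cases_file = BASE_DIR / "evaluation" / "cases" / "known_failure_cases.json"  # assumed location
    artifacts_dir = BASE_DIR / "evaluation" / "artifacts"  # assumed location
    summary = evaluate_known_failure_cases(cases_file, artifacts_dir)
    print(f"known-failure regression: {summary['passed']}/{summary['count']} passed")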