#!/usr/bin/env python3
"""
Run LangSmith evaluation experiments for the Cashy agent.

Usage:
    uv run python scripts/run_eval.py                                             # Run experiment (default dataset + prefix)
    uv run python scripts/run_eval.py --prefix cashy-new-prompt                   # A/B test with custom prefix
    uv run python scripts/run_eval.py --dataset cashy-eval-v2.0                   # Use specific dataset
    uv run python scripts/run_eval.py --upload                                    # Upload eval cases to LangSmith
    uv run python scripts/run_eval.py --upload --file eval_cases/eval_cases_v1.json  # Upload specific file
"""
import argparse
import json
import logging
import sys
from pathlib import Path

# Add project root to path so `src` and `scripts` imports resolve
sys.path.insert(0, str(Path(__file__).parent.parent))

from dotenv import load_dotenv

# Load .env before the LangSmith/agent imports so API keys are in the
# environment by the time anything reads them
load_dotenv(Path(__file__).parent.parent / ".env")

from langsmith import Client
from langsmith.evaluation import evaluate
from langchain_core.messages import HumanMessage

from src.agent.graph import create_agent
from scripts.evaluators import all_evaluators
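
# `all_evaluators` is expected to be a list of LangSmith evaluators. As a
# rough sketch of that convention (function and key names hypothetical, not
# taken from scripts/evaluators.py):
#
#     def expected_tools_called(outputs: dict, reference_outputs: dict) -> dict:
#         """Score 1.0 if every expected tool was actually called."""
#         expected = set(reference_outputs["expected_tools"])
#         called = set(outputs["tools_called"])
#         return {"key": "expected_tools_called", "score": float(expected <= called)}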

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)-12s] %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)
# Quiet noisy HTTP-client loggers
for name in ("httpx", "httpcore", "urllib3", "hf_transfer"):
    logging.getLogger(name).setLevel(logging.WARNING)
logger = logging.getLogger("cashy.eval")

EVAL_FILE = Path(__file__).parent.parent / "eval_cases" / "eval_cases_v2.json"
DEFAULT_DATASET = "cashy-eval-v2.0"
DEFAULT_PREFIX = "cashy-baseline"


def make_target(agent):
    """Create a target function that wraps the agent for langsmith evaluate()."""

    def run_agent(inputs: dict) -> dict:
        try:
            result = agent.invoke({"messages": [HumanMessage(content=inputs["input"])]})
            response = result["messages"][-1].content

            # Walk the full message trajectory and collect every tool call,
            # so evaluators can score tool selection and arguments
            tools_called = []
            tool_args = []
            for msg in result["messages"]:
                if hasattr(msg, "tool_calls") and msg.tool_calls:
                    for tc in msg.tool_calls:
                        tools_called.append(tc["name"])
                        tool_args.append(tc.get("args", {}))

            return {
                "response": response,
                "tools_called": tools_called,
                "tool_args": tool_args,
                "error": None,
            }
        except Exception as e:
            # Report failures as outputs rather than raising, so one bad
            # case doesn't abort the whole experiment
            logger.error("Agent error: %s", e)
            return {
                "response": None,
                "tools_called": [],
                "tool_args": [],
                "error": str(e),
            }

    return run_agent
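
# For orientation: the outputs dict above is what each evaluator receives.
# A hypothetical case (tool names illustrative, not from the real agent)
# would produce something like:
#
#     run_agent({"input": "What's my checking balance?"})
#     # -> {"response": "Your balance is ...",
#     #     "tools_called": ["get_balance"],
#     #     "tool_args": [{"account": "checking"}],
#     #     "error": None}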


def upload_to_langsmith(eval_data: dict):
    """Upload eval cases as a LangSmith dataset with enriched outputs."""
    client = Client()
    version = eval_data["metadata"]["version"]
    dataset_name = f"cashy-eval-v{version}"

    try:
        dataset = client.create_dataset(
            dataset_name=dataset_name,
            description=eval_data["metadata"]["description"],
        )
        logger.info("Created dataset: %s", dataset_name)
    except Exception:
        # create_dataset raises if the dataset already exists; reuse it
        dataset = client.read_dataset(dataset_name=dataset_name)
        logger.info("Dataset already exists: %s", dataset_name)

    for case in eval_data["cases"]:
        client.create_example(
            inputs={"input": case["input"]},
            # Expected values go in `outputs` so evaluators can compare them
            # against the agent's actual run
            outputs={
                "expected_tools": case.get("expected_tools", []),
                "expected_output_contains": case.get("expected_output_contains", []),
                "expected_tool_args": case.get("expected_tool_args", {}),
            },
            dataset_id=dataset.id,
            metadata={
                "category": case.get("category"),
                "case_id": case["id"],
                "criteria": case.get("evaluation_criteria", []),
            },
        )
    logger.info("Uploaded %d examples to dataset '%s'", len(eval_data["cases"]), dataset_name)


def main():
    parser = argparse.ArgumentParser(description="Run Cashy LangSmith evaluation")
    parser.add_argument("--dataset", default=DEFAULT_DATASET, help="LangSmith dataset name")
    parser.add_argument("--prefix", default=DEFAULT_PREFIX, help="Experiment prefix (for A/B naming)")
    parser.add_argument("--upload", action="store_true", help="Upload eval cases to LangSmith dataset")
    parser.add_argument("--file", default=str(EVAL_FILE), help="Local eval cases JSON file")
    args = parser.parse_args()

    if args.upload:
        eval_data = json.loads(Path(args.file).read_text())
        logger.info("Loaded %d eval cases from %s", len(eval_data["cases"]), args.file)
        upload_to_langsmith(eval_data)
        return

    logger.info("Creating agent...")
    agent = create_agent()
    target = make_target(agent)

    logger.info("Running experiment '%s' on dataset '%s'...", args.prefix, args.dataset)
    evaluate(
        target,
        data=args.dataset,
        evaluators=all_evaluators,
        experiment_prefix=args.prefix,
        max_concurrency=0,  # 0 = no concurrency; cases run sequentially
    )

    print("\n" + "=" * 60)
    print(f"Experiment '{args.prefix}' complete.")
    print(f"View results in LangSmith: Datasets > {args.dataset} > Experiments")
    print("=" * 60)


if __name__ == "__main__":
    main()