#!/usr/bin/env python3
"""
Run LangSmith evaluation experiments for the Cashy agent.
Usage:
uv run python scripts/run_eval.py # Run experiment (default dataset + prefix)
uv run python scripts/run_eval.py --prefix cashy-new-prompt # A/B test with custom prefix
uv run python scripts/run_eval.py --dataset cashy-eval-v2.0 # Use specific dataset
uv run python scripts/run_eval.py --upload # Upload eval cases to LangSmith
uv run python scripts/run_eval.py --upload --file eval_cases/eval_cases_v1.json # Upload specific file
"""
import argparse
import json
import logging
import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
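
# Load environment variables (LangSmith / model API keys) before the imports
# below, which may read them at import time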
from dotenv import load_dotenv
load_dotenv(Path(__file__).parent.parent / ".env")

from langsmith import Client
from langsmith.evaluation import evaluate
from langchain_core.messages import HumanMessage

from src.agent.graph import create_agent
from scripts.evaluators import all_evaluators

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)-12s] %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)

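# Silence chatty HTTP/transfer libraries that log at INFO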
for name in ("httpx", "httpcore", "urllib3", "hf_transfer"):
    logging.getLogger(name).setLevel(logging.WARNING)

logger = logging.getLogger("cashy.eval")

EVAL_FILE = Path(__file__).parent.parent / "eval_cases" / "eval_cases_v2.json"
DEFAULT_DATASET = "cashy-eval-v2.0"
DEFAULT_PREFIX = "cashy-baseline"


def make_target(agent):
    """Create a target function that wraps the agent for langsmith evaluate()."""

    def run_agent(inputs: dict) -> dict:
        try:
            result = agent.invoke({"messages": [HumanMessage(content=inputs["input"])]})
            response = result["messages"][-1].content
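
            # Collect every tool call made during the run so evaluators can
            # score tool selection and arguments, not just the final answer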
            tools_called = []
            tool_args = []
            for msg in result["messages"]:
                if hasattr(msg, "tool_calls") and msg.tool_calls:
                    for tc in msg.tool_calls:
                        tools_called.append(tc["name"])
                        tool_args.append(tc.get("args", {}))
            return {
                "response": response,
                "tools_called": tools_called,
                "tool_args": tool_args,
                "error": None,
            }
        except Exception as e:
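            # Surface the failure as structured output instead of raising, so one
            # bad case doesn't abort the experiment and evaluators can score it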
            logger.error("Agent error: %s", e)
            return {
                "response": None,
                "tools_called": [],
                "tool_args": [],
                "error": str(e),
            }

    return run_agent


def upload_to_langsmith(eval_data: dict):
    """Upload eval cases as a LangSmith dataset with enriched outputs."""
    client = Client()
    version = eval_data["metadata"]["version"]
    dataset_name = f"cashy-eval-v{version}"
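
    # create_dataset raises if a dataset with this name already exists, so fall
    # back to reading it; note that re-running --upload then appends duplicate examples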
    try:
        dataset = client.create_dataset(
            dataset_name=dataset_name,
            description=eval_data["metadata"]["description"],
        )
        logger.info("Created dataset: %s", dataset_name)
    except Exception:
        dataset = client.read_dataset(dataset_name=dataset_name)
        logger.info("Dataset already exists: %s", dataset_name)
    for case in eval_data["cases"]:
        client.create_example(
            inputs={"input": case["input"]},
            outputs={
                "expected_tools": case.get("expected_tools", []),
                "expected_output_contains": case.get("expected_output_contains", []),
                "expected_tool_args": case.get("expected_tool_args", {}),
            },
            dataset_id=dataset.id,
            metadata={
                "category": case.get("category"),
                "case_id": case["id"],
                "criteria": case.get("evaluation_criteria", []),
            },
        )
    logger.info("Uploaded %d examples to dataset '%s'", len(eval_data["cases"]), dataset_name)


def main():
    parser = argparse.ArgumentParser(description="Run Cashy LangSmith evaluation")
    parser.add_argument("--dataset", default=DEFAULT_DATASET, help="LangSmith dataset name")
    parser.add_argument("--prefix", default=DEFAULT_PREFIX, help="Experiment prefix (for A/B naming)")
    parser.add_argument("--upload", action="store_true", help="Upload eval cases to LangSmith dataset")
    parser.add_argument("--file", default=str(EVAL_FILE), help="Local eval cases JSON file")
    args = parser.parse_args()
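
    # Upload mode: push local eval cases to LangSmith, then exit without running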
    if args.upload:
        eval_data = json.loads(Path(args.file).read_text())
        logger.info("Loaded %d eval cases from %s", len(eval_data["cases"]), args.file)
        upload_to_langsmith(eval_data)
        return
logger.info("Creating agent...")
agent = create_agent()
target = make_target(agent)
logger.info("Running experiment '%s' on dataset '%s'...", args.prefix, args.dataset)
    evaluate(
        target,
        data=args.dataset,
        evaluators=all_evaluators,
        experiment_prefix=args.prefix,
        max_concurrency=0,
    )
print("\n" + "=" * 60)
print(f"Experiment '{args.prefix}' complete.")
print(f"View results in LangSmith: Datasets > {args.dataset} > Experiments")
print("=" * 60)
if __name__ == "__main__":
main()