# file: 03_run_evaluation.py
"""Run A/B evaluations of the agent graph against a LangSmith dataset.

Each model listed in ``main`` is wrapped in a ``ChatOpenAI`` client pointed
at the OpenRouter endpoint, turned into a graph app via
``create_graph_app``, and evaluated with ``Client.run_on_dataset``. Results
are stored under one project per model.
"""

import os

from dotenv import load_dotenv
from langsmith import Client
from langchain_openai import ChatOpenAI

from agent_graph_factory import create_graph_app

OPENROUTER_API_URL = "https://openrouter.ai/api/v1"


def main():
    """Main function to run A/B test evaluations on our agent.

    Loads environment variables from a ``.env`` file, then evaluates every
    model in the local ``models_to_test`` list against the
    ``word-count-golden-set`` dataset, writing each run to a project named
    ``test-<sanitized model name>``.

    Raises:
        RuntimeError: If ``OPENROUTER_API_KEY`` is unset or empty after the
            ``.env`` file has been loaded.
    """
    load_dotenv()

    # Fail fast with an actionable message instead of forwarding ``None`` to
    # ChatOpenAI and surfacing an obscure authentication error mid-run.
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENROUTER_API_KEY is not set; add it to the environment or "
            "to the .env file before running the evaluation."
        )

    client = Client()

    # Define the models we want to A/B test: Premium vs. Free
    models_to_test = [
        "anthropic/claude-sonnet-4",
        "qwen/qwen3-coder",
    ]
    dataset_name = "word-count-golden-set"

    print(f"--- Running Evaluations on Dataset: {dataset_name} ---")

    for model_name in models_to_test:
        print(f"--- Testing Model: {model_name} ---")

        llm = ChatOpenAI(
            model=model_name,
            temperature=0,
            openai_api_key=api_key,
            openai_api_base=OPENROUTER_API_URL,
        )
        app = create_graph_app(llm)

        # Sanitize the model name for use as a project name
        sanitized_model_name = model_name.replace('/', '-').replace(':', '_')
        project_name = f"test-{sanitized_model_name}"

        # The app is handed over through a zero-argument factory. The call
        # sits inside the loop body, so the lambda sees this iteration's
        # ``app`` when it is invoked during this call.
        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=lambda: app,
            project_name=project_name,
        )
        print(f"--- Test complete. Results in project: {project_name} ---")


if __name__ == "__main__":
    main()