ai-codebase-analyst / source_code /03_run_evaluation.py
arizen-dev's picture
Initial project upload
90f65f7
Raw
History Blame Contribute Delete
1.44 kB
# file: 03_run_evaluation.py
import os
from dotenv import load_dotenv
from langsmith import Client
from langchain_openai import ChatOpenAI
from agent_graph_factory import create_graph_app
# Base URL of the OpenRouter API; handed to ChatOpenAI as `openai_api_base` in main().
OPENROUTER_API_URL = "https://openrouter.ai/api/v1"
def main():
    """Run A/B test evaluations of the agent against a LangSmith dataset.

    Loads environment variables from a ``.env`` file, then, for every model
    in the comparison list, builds a ``ChatOpenAI`` client pointed at
    ``OPENROUTER_API_URL``, wraps it with ``create_graph_app`` and runs the
    resulting app over the ``word-count-golden-set`` dataset through
    ``Client.run_on_dataset``. Each model's results go to its own project,
    whose name is printed once the run finishes.

    Raises:
        RuntimeError: If ``OPENROUTER_API_KEY`` is not set after loading
            the environment.
    """
    # Local import: only needed here, to make project names unique per run.
    from uuid import uuid4

    load_dotenv()

    # Read the key once and fail fast with a clear message instead of
    # handing ``None`` to ChatOpenAI and failing later with an obscure error.
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENROUTER_API_KEY is not set; add it to the environment or "
            "to the .env file before running the evaluation."
        )

    client = Client()

    # Define the models we want to A/B test: Premium vs. Free
    models_to_test = [
        "anthropic/claude-sonnet-4",
        "qwen/qwen3-coder",
    ]
    dataset_name = "word-count-golden-set"

    # One suffix per script invocation: a fixed project name would collide
    # with the project created by a previous run of this script.
    run_suffix = uuid4().hex[:8]

    print(f"--- Running Evaluations on Dataset: {dataset_name} ---")
    for model_name in models_to_test:
        print(f"--- Testing Model: {model_name} ---")
        llm = ChatOpenAI(
            model=model_name,
            temperature=0,
            openai_api_key=api_key,
            openai_api_base=OPENROUTER_API_URL,
        )
        app = create_graph_app(llm)

        # Sanitize the model name for use as a project name
        sanitized_model_name = model_name.replace('/', '-').replace(':', '_')
        project_name = f"test-{sanitized_model_name}-{run_suffix}"

        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=lambda: app,
            project_name=project_name,
        )
        print(f"--- Test complete. Results in project: {project_name} ---")
if __name__ == "__main__":
    # Run the evaluation only when executed as a script, not on import.
    main()