# omnirouter-api/src/evaluation/run_evals.py
# Uploaded by sumitrwk ("Upload 33 files", commit b534a53, verified)
from langchain_core.messages import HumanMessage, ToolMessage, AIMessage
# Import your real compiled agent and your real judge
from src.agent.graph import app
from src.evaluation.judge import check_hallucination
def _message_text(content):
    """Flatten a LangChain message ``content`` payload into plain text.

    Message content may be a plain string or a list of blocks, where each
    block is either a string or a dict. Text is taken from string blocks and
    from dict blocks that carry a string ``"text"`` entry; other blocks
    (for example tool-call blocks) contribute nothing.
    """
    if isinstance(content, str):
        return content
    if content is None:
        return ""
    if not isinstance(content, (list, tuple)):
        # Unknown payload shape: fall back to its string form rather than crash.
        return str(content)

    pieces = []
    for block in content:
        if isinstance(block, str):
            pieces.append(block)
        elif isinstance(block, dict) and isinstance(block.get("text"), str):
            pieces.append(block["text"])
    return "".join(pieces)


def evaluate_real_agent(query: str):
    """Run the compiled agent on ``query`` and grade its answer for hallucination.

    The agent graph (``app``) is invoked once, the text of every
    ``ToolMessage`` in the resulting state is collected as the retrieved
    context, and the last non-empty ``AIMessage`` is taken as the final
    answer. Both are handed to ``check_hallucination`` and a short report is
    printed.

    Args:
        query: The user question to send to the agent.

    Returns:
        The object returned by ``check_hallucination`` (its ``score`` and
        ``reasoning`` attributes are printed in the report), or ``None`` when
        the run produced no tool output or no final answer to grade.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import uuid

    print("\n==================================================")
    print("🚀 RUNNING REAL AGENT EVALUATION")
    print(f"Query: '{query}'")
    print("==================================================")

    # 1. Trigger the Real Agent
    initial_state = {"messages": [HumanMessage(content=query)]}
    # Use a fresh thread id per run: if the graph is compiled with a
    # checkpointer, reusing one id would restore messages from earlier runs
    # and mix their tool output into this run's context.
    thread_id = f"automated_eval_run_{uuid.uuid4().hex}"
    config = {"configurable": {"thread_id": thread_id}}

    print("\n🤖 Agent is thinking and searching...")
    # .invoke() is enough here; streaming is not needed for a backend test.
    final_state = app.invoke(initial_state, config)

    # 2. Extract the Dynamic Data from the State Machine's Memory
    context_chunks = []
    final_answer = ""
    for msg in final_state["messages"]:
        if isinstance(msg, ToolMessage):
            # Text returned by a tool call (the retrieval tool's output).
            context_chunks.append(_message_text(msg.content))
        elif isinstance(msg, AIMessage):
            # Keep overwriting so the last AI message with text wins.
            answer_text = _message_text(msg.content)
            if answer_text:
                final_answer = answer_text
    retrieved_context = "".join(chunk + "\n" for chunk in context_chunks)

    if not retrieved_context:
        print("⚠️ Agent did not use the database. Cannot run Hallucination check.")
        return None

    if not final_answer:
        # Nothing to grade: judging an empty answer would be meaningless.
        print("⚠️ Agent did not produce a final answer. Cannot run Hallucination check.")
        return None

    # 3. Pass the dynamic data to the Judge
    result = check_hallucination(context=retrieved_context, answer=final_answer)

    # 4. Print the final Evaluation Report
    print("\n📊 EVALUATION REPORT")
    print(f"Score: {result.score} / 1")
    if result.score == 1:
        print("✅ PASS: Answer is completely grounded in the database.")
    else:
        print("❌ FAIL: Hallucination detected!")
    print(f"Judge's Reasoning: {result.reasoning}")
    print("==================================================\n")

    # Hand the verdict back so callers can aggregate results across queries.
    return result
if __name__ == "__main__":
    # Script entry point: run one end-to-end evaluation with a sample question.
    evaluate_real_agent(query="What is OmniRouter and what does it do?")