|
|
|
|
|
""" |
|
|
Complete Integration Test for GAIA Agent System |
|
|
Tests the full pipeline: Router -> Agents -> Tools -> Results |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import time |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent)) |
|
|
|
|
|
from agents.state import GAIAAgentState, QuestionType, AgentRole |
|
|
from agents.router import RouterAgent |
|
|
from agents.web_researcher import WebResearchAgent |
|
|
from agents.file_processor_agent import FileProcessorAgent |
|
|
from agents.reasoning_agent import ReasoningAgent |
|
|
from models.qwen_client import QwenClient |
|
|
|
|
|
def test_complete_pipeline():
    """Run the end-to-end GAIA agent pipeline and report the outcome.

    The test builds a ``QwenClient`` plus the router, web-research,
    file-processor and reasoning agents, then:

    1. Routes three text-only questions and runs the agent each case
       expects, provided the router actually selected that agent.
    2. Writes a small CSV into a temporary directory and runs the file
       processor on a question about it.
    3. Prints a summary (pass rate, cost, timing, budget status).

    Returns:
        bool: ``True`` when at least 80% of the recorded cases succeeded,
        ``False`` otherwise, and ``False`` immediately if the system
        cannot be initialised.

    NOTE(review): the emoji in the messages were restored from
    mis-encoded text; a few glyphs are best guesses -- confirm them
    against the intended output.
    """
    print("🚀 GAIA Complete Integration Test")
    print("=" * 50)

    # Build the whole system up front; any construction failure aborts the
    # run, since no case can execute without these objects.
    try:
        llm_client = QwenClient()
        router = RouterAgent(llm_client)
        web_agent = WebResearchAgent(llm_client)
        file_agent = FileProcessorAgent(llm_client)
        reasoning_agent = ReasoningAgent(llm_client)
    except Exception as e:
        print(f"❌ Failed to initialize system: {e}")
        return False

    # Text-only cases: each names the agent the router is expected to pick.
    test_cases = [
        {
            "question": "What is the population of Paris?",
            "description": "Simple Wikipedia/web research question",
            "expected_agent": AgentRole.WEB_RESEARCHER,
        },
        {
            "question": "Calculate the area of a circle with radius 5 meters",
            "description": "Mathematical reasoning with unit conversion",
            "expected_agent": AgentRole.REASONING_AGENT,
        },
        {
            "question": "What is the average of these numbers: 10, 20, 30, 40, 50?",
            "description": "Statistical calculation",
            "expected_agent": AgentRole.REASONING_AGENT,
        },
    ]

    results = []  # one bool per recorded case
    total_cost = 0.0
    start_time = time.time()

    for i, test_case in enumerate(test_cases, 1):
        expected_agent = test_case["expected_agent"]
        print(f"\n🧪 Test {i}: {test_case['description']}")
        print(f" Question: {test_case['question']}")

        try:
            state = GAIAAgentState()
            state.task_id = f"test_{i}"
            state.question = test_case["question"]

            routed_state = router.route_question(state)
            selected = [a.value for a in routed_state.selected_agents]
            print(f" ✅ Router: {routed_state.question_type.value} -> {selected}")

            if expected_agent in routed_state.selected_agents:
                if expected_agent == AgentRole.WEB_RESEARCHER:
                    processed_state = web_agent.process(routed_state)
                elif expected_agent == AgentRole.REASONING_AGENT:
                    processed_state = reasoning_agent.process(routed_state)
                elif expected_agent == AgentRole.FILE_PROCESSOR:
                    processed_state = file_agent.process(routed_state)
                else:
                    # No runner for this role: the case is skipped and is
                    # not counted as a pass or a failure.
                    print(f" ⚠️ Agent {expected_agent.value} not implemented in test")
                    continue

                if processed_state.agent_results:
                    # The last stored result is taken as this run's outcome.
                    agent_result = list(processed_state.agent_results.values())[-1]
                    success = agent_result.success
                    confidence = agent_result.confidence
                    cost = processed_state.total_cost
                    processing_time = processed_state.total_processing_time

                    print(f" ✅ Agent: {agent_result.agent_role.value}")
                    print(f" ✅ Result: {agent_result.result[:100]}...")
                    print(f" 📊 Confidence: {confidence:.2f}")
                    print(f" 💰 Cost: ${cost:.4f}")
                    print(f" ⏱️ Time: {processing_time:.2f}s")

                    total_cost += cost
                    results.append(success)

                    print(f" 🎯 Overall: {'✅ PASS' if success else '❌ FAIL'}")
                else:
                    print(" ❌ No agent results produced")
                    results.append(False)
            else:
                print(f" ⚠️ Expected agent {expected_agent.value} not selected")
                results.append(False)

        except Exception as e:
            # A failing case must not stop the remaining cases.
            print(f" ❌ Pipeline failed: {e}")
            results.append(False)

    # File-based case: the agent is called directly on a CSV fixture.
    print("\n🧪 Test 4: File Processing with CSV")
    print(" Description: Complete file analysis pipeline")

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            csv_path = os.path.join(temp_dir, "sales_data.csv")
            # Explicit encoding so the fixture does not depend on the
            # platform default.
            with open(csv_path, "w", encoding="utf-8") as f:
                f.write("product,sales,price\nWidget A,100,25.50\nWidget B,150,30.00\nWidget C,80,22.75")

            state = GAIAAgentState()
            state.task_id = "test_file"
            state.question = "What is the total sales value across all products?"
            state.file_name = "sales_data.csv"
            state.file_path = csv_path

            routed_state = router.route_question(state)
            processed_state = file_agent.process(routed_state)

            if processed_state.agent_results:
                agent_result = list(processed_state.agent_results.values())[-1]
                success = agent_result.success
                total_cost += processed_state.total_cost
                results.append(success)

                print(f" ✅ Router: {routed_state.question_type.value}")
                print(" ✅ Agent: File processor")
                print(f" ✅ Result: {agent_result.result[:100]}...")
                print(f" 💰 Cost: ${processed_state.total_cost:.4f}")
                print(f" 🎯 Overall: {'✅ PASS' if success else '❌ FAIL'}")
            else:
                print(" ❌ File processing failed")
                results.append(False)

    except Exception as e:
        print(f" ❌ File test failed: {e}")
        results.append(False)

    # Summary. The file case always records a result, so `total` is >= 1.
    total_time = time.time() - start_time
    passed = sum(results)
    total = len(results)
    pass_rate = (passed / total) * 100
    average_cost = total_cost / total

    print("\n" + "=" * 50)
    print("📊 COMPLETE INTEGRATION RESULTS")
    print("=" * 50)
    print(f"🎯 Tests Passed: {passed}/{total} ({pass_rate:.1f}%)")
    print(f"💰 Total Cost: ${total_cost:.4f}")
    print(f"⏱️ Total Time: {total_time:.2f} seconds")
    print(f"📈 Average Cost per Test: ${average_cost:.4f}")
    print(f"⚡ Average Time per Test: {total_time/total:.2f}s")

    # Budget check against the hard-coded 0.10 budget.
    monthly_budget = 0.10
    if total_cost <= monthly_budget:
        remaining_budget = monthly_budget - total_cost
        if average_cost > 0:
            estimated_questions = int(remaining_budget / average_cost)
            print(f"💰 Budget Status: ✅ ${remaining_budget:.4f} remaining (~{estimated_questions} more tests)")
        else:
            # Nothing was charged (all cases failed or cost 0.0), so there
            # is no per-test cost to extrapolate from; dividing by it would
            # raise ZeroDivisionError.
            print(f"💰 Budget Status: ✅ ${remaining_budget:.4f} remaining (no cost recorded, cannot estimate more tests)")
    else:
        print(f"💰 Budget Status: ⚠️ Over budget by ${total_cost - monthly_budget:.4f}")

    # Verdict: the pass rate decides the return value; cost only changes
    # which success message is printed.
    if pass_rate >= 80 and total_cost <= 0.05:
        print("\n🎉 INTEGRATION SUCCESS! System ready for GAIA benchmark!")
        return True
    elif pass_rate >= 80:
        print("\n✅ FUNCTIONALITY SUCCESS! (Higher cost than ideal)")
        return True
    else:
        print("\n⚠️ INTEGRATION ISSUES! Check individual test failures")
        return False
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the process exit status mirrors the test
    # outcome (0 when the pipeline test reports success, 1 otherwise).
    sys.exit(0 if test_complete_pipeline() else 1)