|
|
|
|
|
""" |
|
|
Real GAIA Questions Test for GAIA Agent System.

Tests the system with actual GAIA benchmark questions.
|
|
""" |
|
|
|
|
|
import json |
|
|
import os |
|
|
import sys |
|
|
import time |
|
|
from pathlib import Path |
|
|
from typing import Dict, List |
|
|
|
|
|
|
|
|
# Put this file's directory first on sys.path — presumably so the `agents`
# and `models` imports below resolve when the script is run directly.
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
|
from agents.state import GAIAAgentState, QuestionType, AgentRole |
|
|
from agents.router import RouterAgent |
|
|
from agents.web_researcher import WebResearchAgent |
|
|
from agents.file_processor_agent import FileProcessorAgent |
|
|
from agents.reasoning_agent import ReasoningAgent |
|
|
from models.qwen_client import QwenClient |
|
|
|
|
|
def load_gaia_questions(file_path: str = "questions.json") -> List[Dict]:
    """Load GAIA questions from a JSON file.

    Args:
        file_path: Path to a UTF-8 encoded JSON file whose top-level value
            is a list of question records.

    Returns:
        The parsed list of questions, or an empty list when the file is
        missing, is not valid JSON, or does not hold a top-level list.
        Each of those failures is reported on stdout instead of raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            questions = json.load(f)
    except FileNotFoundError:
        print(f"β Questions file not found: {file_path}")
        return []
    except json.JSONDecodeError as e:
        print(f"β Invalid JSON in questions file: {e}")
        return []

    # The annotated contract is a list; returning e.g. a JSON object would
    # only fail later, when the caller slices or iterates the result.
    if not isinstance(questions, list):
        print(f"β Questions file must contain a JSON list, got {type(questions).__name__}")
        return []

    return questions
|
|
|
|
|
def classify_question_manually(question: str, file_name: str) -> Dict:
    """Classify a GAIA question with fixed heuristics, for comparison with the router.

    Rules are checked in order and the first match wins: Wikipedia wording,
    YouTube links, the attached file's extension, then keyword groups in the
    question text. All matching is case-insensitive.

    Args:
        question: The question text.
        file_name: Name of the attached file; may be empty or None when the
            question has no attachment.

    Returns:
        A dict with two keys: "type" (a human-readable category) and
        "expected_agent" ("web_researcher", "file_processor" or
        "reasoning_agent").
    """
    question_lower = question.lower()
    # Extensions are compared lower-cased so "DATA.XLSX" matches ".xlsx";
    # a missing file name becomes "" and matches no extension.
    file_lower = (file_name or "").lower()

    if "wikipedia" in question_lower or "featured article" in question_lower:
        return {"type": "Wikipedia Research", "expected_agent": "web_researcher"}

    # Host names are case-insensitive, so look in the lower-cased text
    if "youtube.com" in question_lower or "youtu.be" in question_lower:
        return {"type": "YouTube Analysis", "expected_agent": "web_researcher"}

    file_categories = (
        (('.xlsx', '.csv'), "Excel/CSV Processing"),
        (('.py',), "Python Code Analysis"),
        (('.mp3', '.wav'), "Audio Processing"),
        (('.png', '.jpg', '.jpeg'), "Image Analysis"),
    )
    for extensions, category in file_categories:
        if file_lower.endswith(extensions):
            return {"type": category, "expected_agent": "file_processor"}

    # Keyword checks are plain substring tests, so e.g. "summary" also
    # matches "sum".
    if any(word in question_lower for word in ('calculate', 'total', 'average', 'sum')):
        return {"type": "Mathematical Reasoning", "expected_agent": "reasoning_agent"}
    if "reverse" in question_lower or "encode" in question_lower:
        return {"type": "Text Manipulation", "expected_agent": "reasoning_agent"}
    if any(word in question_lower for word in ('athletes', 'competition', 'olympics')):
        return {"type": "Sports/Statistics Research", "expected_agent": "web_researcher"}

    return {"type": "General Research", "expected_agent": "web_researcher"}
|
|
|
|
|
# NOTE(review): the leading glyphs in the messages below look mis-encoded
# (emoji decoded with the wrong codec?) — confirm the file's encoding.


def _evaluate_question(q: Dict, router, web_agent, reasoning_agent,
                       question_types: Dict, routing_accuracy: Dict) -> Dict:
    """Route one question, run the first applicable agent and return its result record.

    Args:
        q: Question record; read keys are 'task_id', 'Level', 'file_name'
            and 'question'.
        router: Object whose route_question(state) returns the routed state.
        web_agent: Agent run when the web researcher role is selected.
        reasoning_agent: Agent run when the reasoning role is selected
            (and the web researcher role is not).
        question_types: Tally of routed question types; updated in place.
        routing_accuracy: Dict with "correct" and "total" counters;
            updated in place.

    Returns:
        A dict with "success", "confidence", "cost" and "time".

    Raises:
        Whatever record access, routing or result inspection raises; the
        caller records that as a failed question.
    """
    print(f" ID: {q['task_id']}")
    print(f" Level: {q['Level']}")
    print(f" File: {q['file_name'] if q['file_name'] else 'None'}")
    print(f" Question: {q['question'][:100]}...")

    # Heuristic classification, used only as the baseline for routing accuracy
    manual_class = classify_question_manually(q['question'], q['file_name'])
    print(f" Expected Type: {manual_class['type']}")

    state = GAIAAgentState()
    state.task_id = q['task_id']
    state.question = q['question']
    state.difficulty_level = int(q['Level'])
    state.file_name = q['file_name'] if q['file_name'] else None
    if state.file_name:
        # Nothing here fetches the attachment; the path is only recorded
        state.file_path = f"/tmp/{state.file_name}"

    routed_state = router.route_question(state)
    print(f" π§ Router: {routed_state.question_type.value} -> {[a.value for a in routed_state.selected_agents]}")
    print(f" π Complexity: {routed_state.complexity_assessment}")
    print(f" π° Est. Cost: ${routed_state.estimated_cost:.4f}")

    q_type = routed_state.question_type.value
    question_types[q_type] = question_types.get(q_type, 0) + 1

    # Routing counts as correct when the heuristic's agent is among those selected
    expected_agent = manual_class["expected_agent"]
    actual_agents = [a.value for a in routed_state.selected_agents]
    if expected_agent in actual_agents:
        routing_accuracy["correct"] += 1
    routing_accuracy["total"] += 1

    # Only one agent is tried per question: web researcher first, otherwise
    # the reasoning agent. The file processor is never run by this test.
    processed = False
    processed_state = None
    if AgentRole.WEB_RESEARCHER in routed_state.selected_agents:
        try:
            processed_state = web_agent.process(routed_state)
            processed = True
        except Exception as e:
            print(f" β οΈ Web researcher failed: {e}")
    elif AgentRole.REASONING_AGENT in routed_state.selected_agents:
        try:
            processed_state = reasoning_agent.process(routed_state)
            processed = True
        except Exception as e:
            print(f" β οΈ Reasoning agent failed: {e}")
    elif AgentRole.FILE_PROCESSOR in routed_state.selected_agents and not state.file_name:
        print(" β οΈ File processor selected but no file provided")

    if not processed:
        # A routed-but-unprocessed question is still counted as a success,
        # with a fixed 0.5 confidence and no cost or time.
        print(" π Routing only (no processing)")
        return {"success": True, "confidence": 0.5, "cost": 0.0, "time": 0.0}

    # The most recently added agent result is the one reported
    agent_result = list(processed_state.agent_results.values())[-1]
    cost = processed_state.total_cost
    processing_time = processed_state.total_processing_time

    print(f" β Processed by: {agent_result.agent_role.value}")
    print(f" π Result: {agent_result.result[:150]}...")
    print(f" π Confidence: {agent_result.confidence:.2f}")
    print(f" π° Actual Cost: ${cost:.4f}")
    print(f" β±οΈ Time: {processing_time:.2f}s")

    return {
        "success": agent_result.success,
        "confidence": agent_result.confidence,
        "cost": cost,
        "time": processing_time,
    }


def _print_summary(results: List[Dict], total_cost: float, total_time: float,
                   question_types: Dict, routing_accuracy: Dict) -> bool:
    """Print the aggregate report and return True when the success rate is at least 60%.

    Expects a non-empty results list; the entry point only calls this after
    at least one question has been attempted.
    """
    successful_results = [r for r in results if r["success"]]
    success_rate = len(successful_results) / len(results) * 100

    print("\n" + "=" * 50)
    print("π REAL GAIA TEST RESULTS")
    print("=" * 50)

    print(f"π― Questions Processed: {len(results)}")
    print(f"β Successful Processing: {len(successful_results)}/{len(results)} ({success_rate:.1f}%)")
    print(f"π° Total Cost: ${total_cost:.4f}")
    print(f"β±οΈ Total Time: {total_time:.2f} seconds")

    if successful_results:
        # Averages are taken over successful questions only
        avg_confidence = sum(r["confidence"] for r in successful_results) / len(successful_results)
        avg_cost = sum(r["cost"] for r in successful_results) / len(successful_results)
        avg_time = sum(r["time"] for r in successful_results) / len(successful_results)

        print(f"π Average Confidence: {avg_confidence:.2f}")
        print(f"π° Average Cost: ${avg_cost:.4f}")
        print(f"β‘ Average Time: {avg_time:.2f}s")

    print("\nπ Question Type Distribution:")
    for q_type, count in question_types.items():
        print(f" {q_type}: {count}")

    routing_rate = routing_accuracy["correct"] / routing_accuracy["total"] * 100 if routing_accuracy["total"] > 0 else 0
    print(f"\nπ§ Routing Accuracy: {routing_accuracy['correct']}/{routing_accuracy['total']} ({routing_rate:.1f}%)")

    monthly_budget = 0.10
    if total_cost <= monthly_budget:
        remaining = monthly_budget - total_cost
        # Projects the remaining budget at this run's mean cost per question;
        # a zero-cost run falls back to a fixed 1000.
        estimated_questions = int(remaining / (total_cost / len(results))) if total_cost > 0 else 1000
        print(f"π° Budget Status: β ${remaining:.4f} remaining (~{estimated_questions} more questions)")
    else:
        print(f"π° Budget Status: β οΈ Over budget by ${total_cost - monthly_budget:.4f}")

    if success_rate >= 80:
        print(f"\nπ EXCELLENT! System handles real GAIA questions well ({success_rate:.1f}% success)")
        return True
    if success_rate >= 60:
        print(f"\nβ GOOD! System shows promise ({success_rate:.1f}% success)")
        return True

    print(f"\nβ οΈ NEEDS WORK! Low success rate ({success_rate:.1f}%)")
    return False


def test_real_gaia_questions():
    """Test the system with real GAIA questions.

    Loads questions from "../questions.json" (relative to the current working
    directory), builds the router and agents, evaluates the first 8 questions
    and prints a report. A question whose record is malformed or whose
    routing raises is recorded as a failure; it does not stop the run.

    Returns:
        True when at least 60% of the attempted questions succeeded; False
        otherwise, or when no questions could be loaded or the system could
        not be initialized.
    """
    print("π§ͺ Real GAIA Questions Test")
    print("=" * 50)

    questions = load_gaia_questions("../questions.json")
    if not questions:
        print("β No questions loaded. Exiting.")
        return False

    print(f"π Loaded {len(questions)} GAIA questions")

    try:
        llm_client = QwenClient()
        router = RouterAgent(llm_client)
        web_agent = WebResearchAgent(llm_client)
        # Constructed alongside the others but never invoked by this test
        file_agent = FileProcessorAgent(llm_client)
        reasoning_agent = ReasoningAgent(llm_client)
    except Exception as e:
        print(f"β Failed to initialize system: {e}")
        return False

    # Only the first 8 questions are exercised
    test_questions = questions[:8]

    results = []
    total_cost = 0.0
    start_time = time.time()

    question_types = {}
    routing_accuracy = {"correct": 0, "total": 0}

    for i, q in enumerate(test_questions, 1):
        print(f"\nπ Question {i}/{len(test_questions)}")

        # The whole per-question step sits inside the try so a record that
        # lacks an expected key is logged as a failure, not a crash.
        try:
            result = _evaluate_question(
                q, router, web_agent, reasoning_agent,
                question_types, routing_accuracy,
            )
        except Exception as e:
            print(f" β Failed: {e}")
            result = {"success": False, "confidence": 0.0, "cost": 0.0, "time": 0.0}
        else:
            total_cost += result["cost"]
        results.append(result)

    total_time = time.time() - start_time
    return _print_summary(results, total_cost, total_time, question_types, routing_accuracy)
|
|
|
|
|
if __name__ == "__main__":
    # Exit status mirrors the test outcome: 0 when it passed, 1 otherwise.
    sys.exit(0 if test_real_gaia_questions() else 1)