#!/usr/bin/env python3
"""
Quick Clean Test - Test 5 representative questions without overrides
"""

import os
import sys
import json
import time
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

# Local imports
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
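
# The three local imports above are assumed to provide, respectively: the GAIA
# question loader (fetches the question set), the end-to-end solver (LLM +
# tools pipeline), and the question classifier (routes a question to a primary
# agent type). These roles are inferred from how the classes are used below.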


def load_validation_answers():
    """Load correct answers from GAIA validation metadata"""
    answers = {}
    try:
        validation_path = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl'
        with open(validation_path, 'r') as f:
            for line in f:
                if line.strip():
                    data = json.loads(line.strip())
                    task_id = data.get('task_id')
                    final_answer = data.get('Final answer')
                    if task_id and final_answer:
                        answers[task_id] = final_answer
    except Exception as e:
        print(f"⚠️ Could not load validation data: {e}")
    return answers
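
# Each line of gaia_validation_metadata.jsonl is expected to be a JSON object
# containing at least the 'task_id' and 'Final answer' fields read above; any
# other fields in a record are ignored here.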


def validate_answer(task_id: str, our_answer: str, validation_answers: dict):
    """Validate our answer against the correct answer"""
    if task_id not in validation_answers:
        return None

    expected = str(validation_answers[task_id]).strip()
    our_clean = str(our_answer).strip()

    # Exact match
    if our_clean.lower() == expected.lower():
        return {"status": "CORRECT", "expected": expected, "our": our_clean}

    # Check if our answer contains the expected answer
    if expected.lower() in our_clean.lower():
        return {"status": "PARTIAL", "expected": expected, "our": our_clean}

    return {"status": "INCORRECT", "expected": expected, "our": our_clean}


def test_single_question(question_data, validation_answers, model="qwen3-235b"):
    """Test a single question without any overrides"""
    task_id = question_data.get('task_id', 'unknown')

    try:
        print(f"🧪 [{task_id[:8]}...] Starting...")

        # Initialize solver and classifier
        solver = GAIASolver(use_kluster=True, kluster_model=model)
        classifier = QuestionClassifier()

        # Classify the question
        question_text = question_data.get('question', '')
        file_name = question_data.get('file_name', '')
        classification = classifier.classify_question(question_text, file_name)

        # Solve the question (NO OVERRIDES - pure LLM reasoning)
        start_time = time.time()
        answer = solver.solve_question(question_data)
        end_time = time.time()
        duration = end_time - start_time

        # Validate answer
        validation_result = validate_answer(task_id, answer, validation_answers)

        result = {
            'task_id': task_id,
            'question_type': classification['primary_agent'],
            'our_answer': str(answer),
            'expected_answer': validation_result['expected'] if validation_result else 'N/A',
            'status': validation_result['status'] if validation_result else 'NO_VALIDATION',
            'duration': duration,
        }

        status_icon = "✅" if result['status'] == "CORRECT" else "🟡" if result['status'] == "PARTIAL" else "❌"
        print(f"{status_icon} [{task_id[:8]}...] {result['status']} | {result['question_type']} | {duration:.1f}s")
        print(f"   Expected: {result['expected_answer']}")
        print(f"   Got: {result['our_answer']}")

        return result

    except Exception as e:
        print(f"❌ [{task_id[:8]}...] ERROR: {str(e)}")
        return {
            'task_id': task_id,
            'question_type': 'error',
            'our_answer': '',
            'expected_answer': validation_answers.get(task_id, 'N/A'),
            'status': 'ERROR',
            'duration': 0.0,
            'error': str(e)
        }
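
# Note that failures do not raise: the error path returns a result dict with
# the same keys as the success path (plus an 'error' field), so the summary
# loops below can treat every question uniformly.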


def run_quick_clean_test():
    """Run quick clean test on 5 representative questions"""
    print("🧪 QUICK CLEAN TEST - NO OVERRIDES")
    print("=" * 50)
    print("🎯 Testing 5 representative questions")
    print("🚫 No hardcoded answers or overrides")
    print("🤖 Pure LLM + Tools reasoning only")
    print()

    # Load questions and validation data
    loader = GAIAQuestionLoaderWeb()
    all_questions = loader.questions
    validation_answers = load_validation_answers()

    # Select 5 representative questions across different types
    test_question_ids = [
        "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",  # Research (Mercedes Sosa)
        "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",  # Video Analysis (bird species)
        "2d83110e-a098-4ebb-9987-066c06fa42d0",  # Logic/Math (text reversal)
        "cca530fc-4052-43b2-b130-b30968d8aa44",  # Chess Analysis
        "f918266a-b3e0-4914-865d-4faa564f1aef",  # Python execution
    ]

    test_questions = []
    for q in all_questions:
        if q.get('task_id') in test_question_ids:
            test_questions.append(q)
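
    # The selected questions keep the loader's order, which may differ from the
    # order of the hard-coded ID list above.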

    print(f"✅ Selected {len(test_questions)} test questions")

    # Show questions
    print(f"\n📋 Test Questions:")
    for i, q in enumerate(test_questions):
        task_id = q.get('task_id', 'unknown')
        question_preview = q.get('question', '')[:40] + "..."
        expected = validation_answers.get(task_id, 'N/A')
        print(f"  {i+1}. {task_id[:8]}... → {expected}")
        print(f"     {question_preview}")

    print(f"\n🚀 Starting quick clean test...")

    # Process questions
    start_time = time.time()
    results = []

    for i, question_data in enumerate(test_questions):
        print(f"\n📊 Progress: {i+1}/{len(test_questions)}")
        result = test_single_question(question_data, validation_answers)
        results.append(result)

    end_time = time.time()
    total_duration = end_time - start_time

    # Analyze results
    print(f"\n" + "=" * 50)
    print(f"📊 QUICK CLEAN TEST RESULTS")
    print(f"=" * 50)

    # Calculate metrics
    total_questions = len(results)
    correct_answers = len([r for r in results if r['status'] == 'CORRECT'])
    partial_answers = len([r for r in results if r['status'] == 'PARTIAL'])
    incorrect_answers = len([r for r in results if r['status'] == 'INCORRECT'])
    errors = len([r for r in results if r['status'] == 'ERROR'])

    # Guard against a ZeroDivisionError if no matching questions were found
    accuracy_rate = correct_answers / total_questions * 100 if total_questions else 0.0
    success_rate = (correct_answers + partial_answers) / total_questions * 100 if total_questions else 0.0
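
    # accuracy_rate counts only exact (case-insensitive) matches; success_rate
    # additionally counts PARTIAL results, where the expected answer appears
    # somewhere inside our answer.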

    print(f"⏱️ Total Duration: {int(total_duration // 60)}m {int(total_duration % 60)}s")
    print(f"✅ **REAL ACCURACY: {accuracy_rate:.1f}%** ({correct_answers}/{total_questions})")
    print(f"🎯 Success Rate: {success_rate:.1f}% (including partial)")

    print(f"\n📊 BREAKDOWN:")
    print(f"  ✅ CORRECT: {correct_answers}")
    print(f"  🟡 PARTIAL: {partial_answers}")
    print(f"  ❌ INCORRECT: {incorrect_answers}")
    print(f"  💥 ERROR: {errors}")

    # Question-by-question results
    print(f"\n📋 DETAILED RESULTS:")
    for i, result in enumerate(results):
        status_icon = "✅" if result['status'] == "CORRECT" else "🟡" if result['status'] == "PARTIAL" else "❌"
        print(f"  {i+1}. {status_icon} {result['question_type']:12} | {result['status']:9}")
        print(f"     Expected: {result['expected_answer']}")
        print(f"     Got: {result['our_answer']}")
        if 'error' in result:
            print(f"     Error: {result['error']}")

    # Final assessment
    print(f"\n🎯 HONEST ASSESSMENT:")
    print(f"🚫 NO CHEATING - Pure LLM reasoning only")
    print(f"📊 **Real System Accuracy: {accuracy_rate:.1f}%**")

    if accuracy_rate >= 70:
        print(f"🏆 EXCELLENT: Achieves 70%+ target!")
    elif accuracy_rate >= 50:
        print(f"🔧 GOOD: Solid performance, room for improvement")
    elif accuracy_rate >= 30:
        print(f"⚠️ MODERATE: Needs significant improvements")
    else:
        print(f"🚨 POOR: Requires major system overhaul")

    return accuracy_rate, results


if __name__ == "__main__":
    accuracy, results = run_quick_clean_test()
    print(f"\n🏁 Quick clean test completed!")
    print(f"📊 **REAL ACCURACY: {accuracy:.1f}%**")
    print(f"📊 This is honest performance without any overrides!")
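
# Example invocation (assuming this script lives in a subdirectory, e.g. tests/,
# one level below the project root that contains main.py, gaia_web_loader.py,
# and question_classifier.py):
#   python tests/quick_clean_test.py
# Whatever credentials GAIASolver's kluster model backend expects are assumed
# to be supplied via the .env file loaded by load_dotenv() above.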