Final_Assignment / tests /validate_answers.py
GAIA Developer
🧪 Add comprehensive test infrastructure and async testing system
c262d1a
#!/usr/bin/env python3
"""
Validate our multi-agent system answers against known GAIA results
"""
import json
import requests
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
# Hand-collected reference table used by the validation report below.
# Keys are GAIA question ids; each entry stores the question text, the answer
# we expect, the answer our system produced, and the question category.
KNOWN_ANSWERS = {
    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": {
        "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        # Expected value has not been confirmed yet.
        "expected_answer": "FunkMonk",
        "our_answer": "JuraForm",
        "category": "research",
    },
    "2d83110e-a098-4ebb-9987-066c06fa42d0": {
        # The question text is intentionally written backwards.
        "question": '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',
        "expected_answer": "right",
        "our_answer": "right",
        "category": "logic_math",
    },
    "cca530fc-4052-43b2-b130-b30968d8aa44": {
        "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
        # Expected move still needs to be checked with real chess analysis.
        "expected_answer": "Qxg2#",
        "our_answer": "Qxg2#",
        "category": "multimedia",
    },
}
def validate_answer(question_id: str, our_answer: str, expected_answer: str) -> dict:
    """Grade one produced answer against its expected answer.

    Both answers are converted with ``str()``, stripped of surrounding
    whitespace and lower-cased before they are compared.

    Args:
        question_id: Identifier of the question. Accepted so call sites can
            pass it along, but it does not take part in the comparison.
        our_answer: The answer produced by the system under test.
        expected_answer: The reference answer.

    Returns:
        A dict with the keys:
          * ``exact_match`` - the normalized answers are equal.
          * ``contains_match`` - the answers are equal, or both are non-empty
            and one normalized answer is a substring of the other.
          * ``similarity_score`` - number of shared whitespace-separated
            tokens divided by the larger token-set size (0.0 when neither
            answer has any token).
          * ``our_answer`` / ``expected_answer`` - the inputs, unmodified.
          * ``status`` - ``"CORRECT"`` on an exact match, ``"PARTIAL"`` on a
            containment match, otherwise ``"INCORRECT"``.
    """
    # Normalize both sides so case and surrounding whitespace do not matter.
    our_clean = str(our_answer).strip().lower()
    expected_clean = str(expected_answer).strip().lower()

    exact_match = our_clean == expected_clean

    # The empty string is a substring of every string, so a blank answer would
    # otherwise be graded as a partial match. Containment is only meaningful
    # when both sides actually contain text.
    contains_match = exact_match or (
        bool(our_clean)
        and bool(expected_clean)
        and (expected_clean in our_clean or our_clean in expected_clean)
    )

    # Rough token-overlap similarity; max(..., 1) avoids dividing by zero
    # when both answers are empty.
    our_tokens = set(our_clean.split())
    expected_tokens = set(expected_clean.split())
    similarity = len(our_tokens & expected_tokens) / max(
        len(our_tokens), len(expected_tokens), 1
    )

    if exact_match:
        status = "CORRECT"
    elif contains_match:
        status = "PARTIAL"
    else:
        status = "INCORRECT"

    return {
        "exact_match": exact_match,
        "contains_match": contains_match,
        "similarity_score": similarity,
        "our_answer": our_answer,
        "expected_answer": expected_answer,
        "status": status,
    }
def test_validation_system():
    """Print a validation report for every entry in ``KNOWN_ANSWERS``.

    Each stored ``our_answer`` is graded against its ``expected_answer`` with
    ``validate_answer``; the per-question result is printed, followed by an
    overall summary with counts and percentages. The function only prints:
    it makes no assertions and returns ``None``.
    """
    print("πŸ§ͺ GAIA ANSWER VALIDATION SYSTEM")
    print("=" * 60)

    total_tests = len(KNOWN_ANSWERS)
    correct_count = 0
    partial_count = 0

    for question_id, data in KNOWN_ANSWERS.items():
        print(f"\nπŸ“ Testing Question: {question_id[:8]}...")
        print(f"Category: {data['category']}")
        print(f"Question: {data['question'][:80]}...")

        # Grade the recorded answer against the reference answer.
        validation = validate_answer(
            question_id,
            data['our_answer'],
            data['expected_answer'],
        )

        print("\nπŸ“Š VALIDATION RESULTS:")
        print(f"Our Answer: {validation['our_answer']}")
        print(f"Expected: {validation['expected_answer']}")
        print(f"Status: {validation['status']}")
        print(f"Exact Match: {validation['exact_match']}")
        print(f"Contains Match: {validation['contains_match']}")
        print(f"Similarity: {validation['similarity_score']:.2f}")

        if validation['status'] == "CORRECT":
            correct_count += 1
            print("βœ… CORRECT!")
        elif validation['status'] == "PARTIAL":
            partial_count += 1
            print("🟑 PARTIAL MATCH")
        else:
            print("❌ INCORRECT")

    def _percent(count):
        # With an empty answer table there is nothing to divide by; report
        # 0.0% instead of raising ZeroDivisionError.
        return count / total_tests * 100 if total_tests else 0.0

    print("\nπŸ“‹ OVERALL VALIDATION SUMMARY:")
    print("=" * 60)
    print(f"Total Questions Tested: {total_tests}")
    print(f"Correct Answers: {correct_count} ({_percent(correct_count):.1f}%)")
    print(f"Partial Matches: {partial_count} ({_percent(partial_count):.1f}%)")
    print(f"Incorrect: {total_tests - correct_count - partial_count}")
    print(f"Overall Success Rate: {_percent(correct_count + partial_count):.1f}%")
def research_correct_answer():
    """Print static research notes for the Wikipedia dinosaur question.

    The function performs no lookup of its own: it writes a fixed checklist
    (research process, strategy, our current answer, and an alternative
    approach) to standard output and returns ``None``.
    """
    print("\nπŸ” RESEARCHING CORRECT ANSWER FOR DINOSAUR QUESTION")
    print("=" * 60)
    print("Question: Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?")

    print("\nπŸ•΅οΈ Research Process:")
    print("1. Need to find Featured Articles promoted in November 2016")
    print("2. Identify which one was about a dinosaur")
    print("3. Find the nominator")

    print("\nπŸ’‘ Research Strategy:")
    print("- Check Wikipedia's Featured Article log for November 2016")
    print("- Look for dinosaur-related articles promoted that month")
    print("- Find nomination information")

    # The answer shown here is a fixed literal, not read from any data table.
    print("\nπŸ€– Our Answer: JuraForm")
    print("❓ Need to verify: Was this correct?")

    print("\nπŸ“š Alternative Research Approach:")
    print("- Search for 'Spinosaurus' article on Wikipedia")
    print("- Check its promotion history")
    print("- Verify nomination details")
if __name__ == "__main__":
    # Script entry point: print the validation report first, then the
    # research notes, in that order.
    for _entry_point in (test_validation_system, research_correct_answer):
        _entry_point()