Final_Assignment

Running

File size: 5,366 Bytes

c262d1a

#!/usr/bin/env python3
"""
Validate our multi-agent system answers against known GAIA results
"""

import json
import requests
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier

# Manually collected reference data for a handful of GAIA validation questions,
# keyed by question id. Each record carries the question text, the answer we
# expect, the answer our system gave, and the question category. Entries
# flagged "Need to verify" have not been independently confirmed yet.
KNOWN_ANSWERS = {
    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": dict(
        question="Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        expected_answer="FunkMonk",  # Need to verify this
        our_answer="JuraForm",
        category="research",
    ),
    "2d83110e-a098-4ebb-9987-066c06fa42d0": dict(
        # The question text is intentionally written backwards.
        question='.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',
        expected_answer="right",
        our_answer="right",
        category="logic_math",
    ),
    "cca530fc-4052-43b2-b130-b30968d8aa44": dict(
        question="Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
        expected_answer="Qxg2#",  # Need to verify with actual chess analysis
        our_answer="Qxg2#",
        category="multimedia",
    ),
}

def validate_answer(question_id: str, our_answer: str, expected_answer: str) -> dict:
    """Compare one of our answers against the expected answer.

    Both answers are normalised before comparison: ``None`` becomes the empty
    string, anything else is passed through ``str()``, stripped of surrounding
    whitespace and lower-cased.

    Args:
        question_id: Identifier of the question. Accepted so callers can pass
            it along, but it does not take part in the comparison.
        our_answer: The answer produced by our system (``None`` is tolerated
            and treated as an empty answer).
        expected_answer: The reference answer (``None`` is tolerated and
            treated as an empty answer).

    Returns:
        A dict with the keys:
        ``exact_match`` (bool) - normalised answers are equal;
        ``contains_match`` (bool) - one normalised answer is a substring of
        the other;
        ``similarity_score`` (float) - shared whitespace-separated words
        divided by the larger of the two word-set sizes, in ``[0, 1]``;
        ``our_answer`` / ``expected_answer`` - the inputs, unmodified;
        ``status`` - ``"CORRECT"`` on exact match, ``"PARTIAL"`` on a
        substring match, otherwise ``"INCORRECT"``.
    """
    # A missing answer must not be compared as the literal word "none".
    our_clean = "" if our_answer is None else str(our_answer).strip().lower()
    expected_clean = "" if expected_answer is None else str(expected_answer).strip().lower()

    exact_match = our_clean == expected_clean

    # The empty string is a substring of every string, so a blank answer would
    # otherwise count as a partial match for any expected answer. Only run
    # the substring test when both sides actually contain text.
    both_have_text = bool(our_clean) and bool(expected_clean)
    contains_match = exact_match or (
        both_have_text
        and (expected_clean in our_clean or our_clean in expected_clean)
    )

    # Rough word-overlap score; the trailing 1 keeps the divisor non-zero when
    # both answers are empty.
    our_words = set(our_clean.split())
    expected_words = set(expected_clean.split())
    similarity = len(our_words & expected_words) / max(len(our_words), len(expected_words), 1)

    if exact_match:
        status = "CORRECT"
    elif contains_match:
        status = "PARTIAL"
    else:
        status = "INCORRECT"

    return {
        "exact_match": exact_match,
        "contains_match": contains_match,
        "similarity_score": similarity,
        "our_answer": our_answer,
        "expected_answer": expected_answer,
        "status": status,
    }

def test_validation_system():
    """Validate every entry in KNOWN_ANSWERS and print a report.

    For each known question this prints the question id prefix, category,
    a truncated question text and the result of ``validate_answer``; it then
    prints overall counts and percentages of correct, partial and incorrect
    answers. Nothing is returned.

    An empty ``KNOWN_ANSWERS`` table is reported as 0.0% everywhere instead of
    raising ``ZeroDivisionError`` in the summary.
    """
    print("🧪 GAIA ANSWER VALIDATION SYSTEM")
    print("=" * 60)

    total_tests = len(KNOWN_ANSWERS)
    correct_count = 0
    partial_count = 0

    def percent(count):
        # Guard the division: with no known answers there is nothing to rate.
        return count / total_tests * 100 if total_tests else 0.0

    for question_id, data in KNOWN_ANSWERS.items():
        # Ids and question texts are truncated to keep the report compact.
        print(f"\n📝 Testing Question: {question_id[:8]}...")
        print(f"Category: {data['category']}")
        print(f"Question: {data['question'][:80]}...")

        # Validate our answer
        validation = validate_answer(
            question_id,
            data['our_answer'],
            data['expected_answer'],
        )

        print("\n📊 VALIDATION RESULTS:")
        print(f"Our Answer: {validation['our_answer']}")
        print(f"Expected: {validation['expected_answer']}")
        print(f"Status: {validation['status']}")
        print(f"Exact Match: {validation['exact_match']}")
        print(f"Contains Match: {validation['contains_match']}")
        print(f"Similarity: {validation['similarity_score']:.2f}")

        if validation['status'] == "CORRECT":
            correct_count += 1
            print("✅ CORRECT!")
        elif validation['status'] == "PARTIAL":
            partial_count += 1
            print("🟡 PARTIAL MATCH")
        else:
            print("❌ INCORRECT")

    print("\n📋 OVERALL VALIDATION SUMMARY:")
    print("=" * 60)
    print(f"Total Questions Tested: {total_tests}")
    print(f"Correct Answers: {correct_count} ({percent(correct_count):.1f}%)")
    print(f"Partial Matches: {partial_count} ({percent(partial_count):.1f}%)")
    print(f"Incorrect: {total_tests - correct_count - partial_count}")
    # Partial matches are counted as successes in the overall rate.
    print(f"Overall Success Rate: {percent(correct_count + partial_count):.1f}%")

def research_correct_answer():
    """Print the manual research checklist for the Wikipedia dinosaur question.

    No lookup is performed: the function only writes a fixed set of notes
    (the question, the planned research steps, our current answer and an
    alternative approach) to stdout, one ``print`` per line, and returns None.
    """
    divider = "=" * 60
    report_lines = (
        "\n🔍 RESEARCHING CORRECT ANSWER FOR DINOSAUR QUESTION",
        divider,
        "Question: Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        # Steps needed to answer the question by hand.
        "\n🕵️ Research Process:",
        "1. Need to find Featured Articles promoted in November 2016",
        "2. Identify which one was about a dinosaur",
        "3. Find the nominator",
        # Where to look for each step.
        "\n💡 Research Strategy:",
        "- Check Wikipedia's Featured Article log for November 2016",
        "- Look for dinosaur-related articles promoted that month",
        "- Find nomination information",
        # What our system answered; still unverified.
        "\n🤖 Our Answer: JuraForm",
        "❓ Need to verify: Was this correct?",
        # Fallback route if the promotion log is not enough.
        "\n📚 Alternative Research Approach:",
        "- Search for 'Spinosaurus' article on Wikipedia",
        "- Check its promotion history",
        "- Verify nomination details",
    )
    for line in report_lines:
        print(line)

# Script entry point: print the validation report first, then the research
# notes for the dinosaur question.
if __name__ == "__main__":
    for step in (test_validation_system, research_correct_answer):
        step()