File size: 5,366 Bytes
c262d1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
"""
Validate our multi-agent system answers against known GAIA results
"""

import json
import requests
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier

# Hand-collected reference data for a few GAIA validation questions, keyed by
# task id. Each record stores the question text, the answer we expect, the
# answer our system produced, and the question category.
KNOWN_ANSWERS = {
    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": {
        "question": (
            "Who nominated the only Featured Article on English Wikipedia "
            "about a dinosaur that was promoted in November 2016?"
        ),
        # TODO: expected answer has not been independently verified yet.
        "expected_answer": "FunkMonk",
        "our_answer": "JuraForm",
        "category": "research",
    },
    "2d83110e-a098-4ebb-9987-066c06fa42d0": {
        # The question text is intentionally written backwards.
        "question": (
            '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,'
            'ecnetnes siht dnatsrednu uoy fI'
        ),
        "expected_answer": "right",
        "our_answer": "right",
        "category": "logic_math",
    },
    "cca530fc-4052-43b2-b130-b30968d8aa44": {
        "question": (
            "Review the chess position provided in the image. "
            "It is black's turn. "
            "Provide the correct next move for black which guarantees a win. "
            "Please provide your response in algebraic notation."
        ),
        # TODO: expected move still needs confirmation by real chess analysis.
        "expected_answer": "Qxg2#",
        "our_answer": "Qxg2#",
        "category": "multimedia",
    },
}

def _normalize_answer(answer) -> str:
    """Return *answer* stringified, stripped and lower-cased; ``None`` becomes ``""``."""
    if answer is None:
        # str(None) would yield "none", which could falsely match a real answer.
        return ""
    return str(answer).strip().lower()


def validate_answer(question_id: str, our_answer: str, expected_answer: str) -> dict:
    """Compare our answer with the expected answer and classify the result.

    Both answers are normalized (converted to ``str``, stripped of surrounding
    whitespace, lower-cased) before comparison. A missing answer (``None``)
    is treated as an empty string.

    Args:
        question_id: Identifier of the question. Kept for interface
            compatibility; it does not influence the comparison.
        our_answer: The answer produced by our system.
        expected_answer: The reference answer.

    Returns:
        A dict with the keys:
            ``exact_match``: normalized answers are equal.
            ``contains_match``: exact match, or both answers are non-empty
                and one normalized answer is a substring of the other.
            ``similarity_score``: number of shared whitespace-separated
                words divided by the larger word-set size (0.0 to 1.0).
            ``our_answer`` / ``expected_answer``: the inputs, unmodified.
            ``status``: ``"CORRECT"`` on exact match, ``"PARTIAL"`` on
                containment only, otherwise ``"INCORRECT"``.
    """
    our_clean = _normalize_answer(our_answer)
    expected_clean = _normalize_answer(expected_answer)

    exact_match = our_clean == expected_clean

    # The empty string is a substring of every string, so containment is only
    # meaningful when both sides actually contain text. Without this guard an
    # empty answer would be graded as a partial match.
    contains_match = exact_match or (
        bool(our_clean)
        and bool(expected_clean)
        and (expected_clean in our_clean or our_clean in expected_clean)
    )

    # Rough word-overlap score; the trailing 1 avoids division by zero when
    # both answers are empty.
    our_words = set(our_clean.split())
    expected_words = set(expected_clean.split())
    similarity = len(our_words & expected_words) / max(
        len(our_words), len(expected_words), 1
    )

    if exact_match:
        status = "CORRECT"
    elif contains_match:
        status = "PARTIAL"
    else:
        status = "INCORRECT"

    return {
        "exact_match": exact_match,
        "contains_match": contains_match,
        "similarity_score": similarity,
        "our_answer": our_answer,
        "expected_answer": expected_answer,
        "status": status,
    }

def test_validation_system():
    """Validate every entry of KNOWN_ANSWERS and print a report.

    For each known question this prints the category, a truncated question
    text and the result of ``validate_answer`` for the stored
    ``our_answer`` / ``expected_answer`` pair, then prints overall counts and
    percentages. Nothing is returned; all output goes to stdout.
    """

    print("🧪 GAIA ANSWER VALIDATION SYSTEM")
    print("=" * 60)

    total_tests = len(KNOWN_ANSWERS)
    correct_count = 0
    partial_count = 0

    for question_id, data in KNOWN_ANSWERS.items():
        # Only the first 8 id characters / 80 question characters are shown
        # to keep the report compact.
        print(f"\n📝 Testing Question: {question_id[:8]}...")
        print(f"Category: {data['category']}")
        print(f"Question: {data['question'][:80]}...")

        # Validate our answer
        validation = validate_answer(
            question_id,
            data['our_answer'],
            data['expected_answer'],
        )

        print("\n📊 VALIDATION RESULTS:")
        print(f"Our Answer: {validation['our_answer']}")
        print(f"Expected: {validation['expected_answer']}")
        print(f"Status: {validation['status']}")
        print(f"Exact Match: {validation['exact_match']}")
        print(f"Contains Match: {validation['contains_match']}")
        print(f"Similarity: {validation['similarity_score']:.2f}")

        if validation['status'] == "CORRECT":
            correct_count += 1
            print("✅ CORRECT!")
        elif validation['status'] == "PARTIAL":
            partial_count += 1
            print("🟡 PARTIAL MATCH")
        else:
            print("❌ INCORRECT")

    def percent(count: int) -> float:
        # Report 0.0% instead of raising ZeroDivisionError when there are no
        # known answers to validate.
        return count / total_tests * 100 if total_tests else 0.0

    print("\n📋 OVERALL VALIDATION SUMMARY:")
    print("=" * 60)
    print(f"Total Questions Tested: {total_tests}")
    print(f"Correct Answers: {correct_count} ({percent(correct_count):.1f}%)")
    print(f"Partial Matches: {partial_count} ({percent(partial_count):.1f}%)")
    print(f"Incorrect: {total_tests - correct_count - partial_count}")
    print(f"Overall Success Rate: {percent(correct_count + partial_count):.1f}%")

def research_correct_answer():
    """Print research notes for the Wikipedia dinosaur question.

    This performs no lookup itself: it only prints the question, the steps
    needed to verify the answer by hand, our current answer and an
    alternative research approach. Nothing is returned.
    """

    print("\n🔍 RESEARCHING CORRECT ANSWER FOR DINOSAUR QUESTION")
    print("=" * 60)

    print("Question: Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?")
    print("\n🕵️ Research Process:")
    print("1. Need to find Featured Articles promoted in November 2016")
    print("2. Identify which one was about a dinosaur")
    print("3. Find the nominator")

    print("\n💡 Research Strategy:")
    print("- Check Wikipedia's Featured Article log for November 2016")
    print("- Look for dinosaur-related articles promoted that month")
    print("- Find nomination information")

    print("\n🤖 Our Answer: JuraForm")
    print("❓ Need to verify: Was this correct?")

    print("\n📚 Alternative Research Approach:")
    print("- Search for 'Spinosaurus' article on Wikipedia")
    print("- Check its promotion history")
    print("- Verify nomination details")

if __name__ == "__main__":
    # Script entry point: print the validation report first, then the
    # research notes for the dinosaur question.
    for step in (test_validation_system, research_correct_answer):
        step()