Final_Assignment

Running

Final_Assignment / tests /validate_answers.py

GAIA Developer

🧪 Add comprehensive test infrastructure and async testing system

c262d1a about 1 month ago

5.37 kB

	#!/usr/bin/env python3
	"""
	Validate our multi-agent system answers against known GAIA results
	"""

	import json
	import requests
	from gaia_web_loader import GAIAQuestionLoaderWeb
	from main import GAIASolver
	from question_classifier import QuestionClassifier

	# Hand-collected reference answers for a few GAIA validation questions,
	# keyed by question id. Each entry records the question text, the expected
	# answer, the answer our system produced, and the question category.
	KNOWN_ANSWERS = {
	    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": {
	        "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
	        # Expected value has not been independently verified yet.
	        "expected_answer": "FunkMonk",
	        "our_answer": "JuraForm",
	        "category": "research",
	    },
	    "2d83110e-a098-4ebb-9987-066c06fa42d0": {
	        # The question text is deliberately written backwards.
	        "question": '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',
	        "expected_answer": "right",
	        "our_answer": "right",
	        "category": "logic_math",
	    },
	    "cca530fc-4052-43b2-b130-b30968d8aa44": {
	        "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
	        # Expected move still needs to be confirmed with real chess analysis.
	        "expected_answer": "Qxg2#",
	        "our_answer": "Qxg2#",
	        "category": "multimedia",
	    },
	}

	def validate_answer(question_id: str, our_answer: str, expected_answer: str) -> dict:
	    """Compare a produced answer with the expected answer.

	    Both answers are normalised before comparison: converted to ``str``,
	    stripped of surrounding whitespace and lower-cased. A missing answer
	    (``None``) is normalised to the empty string instead of the text
	    ``"none"``.

	    Args:
	        question_id: Identifier of the question. Kept for interface
	            compatibility; it does not take part in the comparison.
	        our_answer: Answer produced by the system.
	        expected_answer: Reference answer.

	    Returns:
	        A dict with the keys:
	        ``exact_match`` (normalised answers are equal),
	        ``contains_match`` (one non-empty normalised answer is a substring
	        of the other, or they are equal),
	        ``similarity_score`` (shared whitespace-separated tokens divided by
	        the larger distinct-token count, between 0 and 1),
	        ``our_answer`` and ``expected_answer`` (the values as passed in),
	        and ``status`` (``"CORRECT"``, ``"PARTIAL"`` or ``"INCORRECT"``).
	    """

	    def _normalise(answer) -> str:
	        # None means "no answer"; do not let it compare as the word "none".
	        return "" if answer is None else str(answer).strip().lower()

	    our_clean = _normalise(our_answer)
	    expected_clean = _normalise(expected_answer)

	    exact_match = our_clean == expected_clean

	    # The empty string is a substring of every string, so a blank answer
	    # would otherwise "contain-match" anything. Only run the substring
	    # test when both sides actually have content.
	    contains_match = exact_match or (
	        bool(our_clean)
	        and bool(expected_clean)
	        and (expected_clean in our_clean or our_clean in expected_clean)
	    )

	    # Rough token-overlap similarity; the trailing 1 in max() avoids a
	    # division by zero when both answers are empty.
	    our_tokens = set(our_clean.split())
	    expected_tokens = set(expected_clean.split())
	    similarity = len(our_tokens & expected_tokens) / max(
	        len(our_tokens), len(expected_tokens), 1
	    )

	    if exact_match:
	        status = "CORRECT"
	    elif contains_match:
	        status = "PARTIAL"
	    else:
	        status = "INCORRECT"

	    return {
	        "exact_match": exact_match,
	        "contains_match": contains_match,
	        "similarity_score": similarity,
	        "our_answer": our_answer,
	        "expected_answer": expected_answer,
	        "status": status,
	    }

	def test_validation_system():
	    """Print a validation report for every entry in ``KNOWN_ANSWERS``.

	    Each entry's recorded ``our_answer`` is compared with its
	    ``expected_answer`` through ``validate_answer``; the per-question result
	    and an overall summary are written to stdout. The function makes no
	    assertions and returns ``None``; it only reports.
	    """

	    print("🧪 GAIA ANSWER VALIDATION SYSTEM")
	    print("=" * 60)

	    total_tests = len(KNOWN_ANSWERS)
	    correct_count = 0
	    partial_count = 0

	    def _percent(count):
	        # An empty answer table must not crash the summary with a
	        # ZeroDivisionError; report 0.0% instead.
	        return count / total_tests * 100 if total_tests else 0.0

	    for question_id, data in KNOWN_ANSWERS.items():
	        # Ids and question texts are truncated to keep the report readable.
	        print(f"\n📝 Testing Question: {question_id[:8]}...")
	        print(f"Category: {data['category']}")
	        print(f"Question: {data['question'][:80]}...")

	        validation = validate_answer(
	            question_id,
	            data['our_answer'],
	            data['expected_answer'],
	        )

	        print("\n📊 VALIDATION RESULTS:")
	        print(f"Our Answer: {validation['our_answer']}")
	        print(f"Expected: {validation['expected_answer']}")
	        print(f"Status: {validation['status']}")
	        print(f"Exact Match: {validation['exact_match']}")
	        print(f"Contains Match: {validation['contains_match']}")
	        print(f"Similarity: {validation['similarity_score']:.2f}")

	        if validation['status'] == "CORRECT":
	            correct_count += 1
	            print("✅ CORRECT!")
	        elif validation['status'] == "PARTIAL":
	            partial_count += 1
	            print("🟡 PARTIAL MATCH")
	        else:
	            print("❌ INCORRECT")

	    print("\n📋 OVERALL VALIDATION SUMMARY:")
	    print("=" * 60)
	    print(f"Total Questions Tested: {total_tests}")
	    print(f"Correct Answers: {correct_count} ({_percent(correct_count):.1f}%)")
	    print(f"Partial Matches: {partial_count} ({_percent(partial_count):.1f}%)")
	    print(f"Incorrect: {total_tests - correct_count - partial_count}")
	    # Partial matches count towards the overall success rate.
	    print(f"Overall Success Rate: {_percent(correct_count + partial_count):.1f}%")

	def research_correct_answer():
	    """Print a manual research checklist for the Wikipedia dinosaur question.

	    Nothing is fetched or computed here: the function only writes a fixed
	    set of headings and bullet points to stdout and returns ``None``.
	    """

	    intro_lines = (
	        "\n🔍 RESEARCHING CORRECT ANSWER FOR DINOSAUR QUESTION",
	        "=" * 60,
	        "Question: Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
	    )

	    # Each section is a heading followed by the lines printed beneath it.
	    sections = (
	        (
	            "\n🕵️ Research Process:",
	            (
	                "1. Need to find Featured Articles promoted in November 2016",
	                "2. Identify which one was about a dinosaur",
	                "3. Find the nominator",
	            ),
	        ),
	        (
	            "\n💡 Research Strategy:",
	            (
	                "- Check Wikipedia's Featured Article log for November 2016",
	                "- Look for dinosaur-related articles promoted that month",
	                "- Find nomination information",
	            ),
	        ),
	        (
	            "\n🤖 Our Answer: JuraForm",
	            (
	                "❓ Need to verify: Was this correct?",
	            ),
	        ),
	        (
	            "\n📚 Alternative Research Approach:",
	            (
	                "- Search for 'Spinosaurus' article on Wikipedia",
	                "- Check its promotion history",
	                "- Verify nomination details",
	            ),
	        ),
	    )

	    for text in intro_lines:
	        print(text)

	    for heading, details in sections:
	        print(heading)
	        for detail in details:
	            print(detail)

	def _run_all():
	    """Run the validation report, then print the research checklist."""
	    test_validation_system()
	    research_correct_answer()


	# Only run the reports when executed as a script, not when imported.
	if __name__ == "__main__":
	    _run_all()