#!/usr/bin/env python3
"""
Test script to demonstrate the new correct answer functionality
"""

import sys
import os

# Put this script's own directory on the import path before importing
# agent_test_client (presumably located next to this file -- confirm).
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from agent_test_client import load_metadata, get_correct_answer, questions

# How many entries of `questions` to show in the sample section.
SAMPLE_COUNT = 5

# Maximum number of question characters printed before truncating with "...".
PREVIEW_LENGTH = 100

# Task IDs that are looked up explicitly, mapped to the answer each one is
# expected to produce.
EXPECTED_ANSWERS = {
    "8e867cd7-cff9-4e6c-867a-ff5ddc2550be": "3",      # Mercedes Sosa
    "a1e91b78-d3d8-4675-bb8d-62741b4b68a6": "3",      # YouTube video
    "2d83110e-a098-4ebb-9987-066c06fa42d0": "Right",  # Reversed text
    "cca530fc-4052-43b2-b130-b30968d8aa44": "Rd5",    # Chess
    "6f37996b-2ac7-44b0-8e68-6d28256631b4": "b, e",   # Math table
}


def _preview(text, limit=PREVIEW_LENGTH):
    """Return *text* cut to *limit* characters, with '...' appended if cut."""
    if len(text) > limit:
        return f"{text[:limit]}..."
    return text


def test_correct_answers():
    """Test the correct answer functionality.

    Prints the number of loaded metadata entries, the first ``SAMPLE_COUNT``
    questions together with their looked-up answers, and then the answer for
    every task ID in ``EXPECTED_ANSWERS``.  Each of those answers is compared
    with its expected value; the success banner is printed only when all of
    them match, otherwise the mismatching task IDs are reported.

    Mismatches are reported on stdout only -- nothing is raised, so the
    script keeps working as a demonstration even with incomplete metadata.
    """
    print("🧪 Testing Correct Answer Functionality")
    print("=" * 50)

    # Load metadata
    metadata = load_metadata()
    print(f"📊 Loaded {len(metadata)} correct answers from metadata.jsonl")
    print()

    # Test a few sample questions
    print("🔍 Sample Questions with Correct Answers:")
    print("-" * 40)

    for number, question_data in enumerate(questions[:SAMPLE_COUNT], start=1):
        task_id = question_data.get("task_id", "Unknown")
        question_text = question_data.get("question", "No question")
        level = question_data.get("Level", "Unknown")

        # Get correct answer
        correct_answer = get_correct_answer(task_id)

        print(f"Question {number}:")
        print(f" 📋 Task ID: {task_id}")
        print(f" 📊 Level: {level}")
        print(f" ❓ Question: {_preview(question_text)}")
        print(f" ✅ Correct Answer: {correct_answer if correct_answer else 'Not found'}")
        print()

    # Test specific questions by task_id
    print("🎯 Testing Specific Questions:")
    print("-" * 30)

    mismatched = []
    for task_id, expected in EXPECTED_ANSWERS.items():
        answer = get_correct_answer(task_id)
        line = f"📋 {task_id[:8]}...: {answer if answer else 'Not found'}"

        # Compare on the stripped string form so that incidental whitespace
        # or a non-str answer value does not produce a false mismatch.
        if answer is None or str(answer).strip() != expected:
            mismatched.append(task_id)
            line += f" (expected {expected!r})"
        print(line)

    print()
    if mismatched:
        # Do not claim success when a lookup failed or returned a wrong value.
        print(
            f"❌ {len(mismatched)} of {len(EXPECTED_ANSWERS)} expected answers "
            f"did not match: {', '.join(task_id[:8] for task_id in mismatched)}"
        )
    else:
        print("🎉 Correct answer functionality is working!")
    print()
    print("📝 Summary of Changes Made:")
    print(" ✅ Added load_metadata() function to parse metadata.jsonl")
    print(" ✅ Added get_correct_answer() function to retrieve answers by task_id")
    print(" ✅ Modified test_evaluation_question() to return correct answer")
    print(" ✅ Modified test_all_evaluation_questions() to include correct answers")
    print(" ✅ Updated Gradio UI with additional Correct Answer box")
    print(" ✅ Updated DataFrame to show both Agent Answer and Correct Answer columns")
    print()
    print("🚀 The Evaluation Questions now show correct answers alongside agent responses!")


if __name__ == "__main__":
    test_correct_answers()