File size: 2,870 Bytes
4f6b4f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
"""
Test script to demonstrate the new correct answer functionality
"""

import sys
import os
# Make sure the directory containing this script is importable, so
# agent_test_client resolves regardless of the current working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from agent_test_client import load_metadata, get_correct_answer, questions

def test_correct_answers():
    """Exercise the correct-answer lookup helpers and print a demo report.

    Loads the task_id -> answer mapping, shows the first few questions with
    their known answers, spot-checks a handful of specific task ids, and
    finishes with a summary of the feature changes.
    """
    print("πŸ§ͺ Testing Correct Answer Functionality")
    print("=" * 50)

    # Pull the task_id -> correct-answer mapping from metadata.jsonl.
    answer_map = load_metadata()
    print(f"πŸ“Š Loaded {len(answer_map)} correct answers from metadata.jsonl")
    print()

    # Walk the first few questions and show each next to its known answer.
    print("πŸ” Sample Questions with Correct Answers:")
    print("-" * 40)

    for number, entry in enumerate(questions[:5], start=1):
        task_id = entry.get("task_id", "Unknown")
        text = entry.get("question", "No question")
        level = entry.get("Level", "Unknown")

        answer = get_correct_answer(task_id)

        # Truncate long question text to 100 chars for readability.
        suffix = "..." if len(text) > 100 else ""
        print(f"Question {number}:")
        print(f"  πŸ“‹ Task ID: {task_id}")
        print(f"  πŸ“Š Level: {level}")
        print(f"  ❓ Question: {text[:100]}{suffix}")
        print(f"  βœ… Correct Answer: {answer if answer else 'Not found'}")
        print()

    # Spot-check known task ids against their expected answers.
    print("🎯 Testing Specific Questions:")
    print("-" * 30)

    known_tasks = (
        "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",  # Mercedes Sosa - should be "3"
        "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",  # YouTube video - should be "3"
        "2d83110e-a098-4ebb-9987-066c06fa42d0",  # Reversed text - should be "Right"
        "cca530fc-4052-43b2-b130-b30968d8aa44",  # Chess - should be "Rd5"
        "6f37996b-2ac7-44b0-8e68-6d28256631b4",  # Math table - should be "b, e"
    )

    for tid in known_tasks:
        found = get_correct_answer(tid)
        print(f"πŸ“‹ {tid[:8]}...: {found if found else 'Not found'}")

    print()
    print("πŸŽ‰ Correct answer functionality is working!")
    print()
    print("πŸ“ Summary of Changes Made:")
    # Emit the change log line by line from a single tuple of messages.
    for change in (
        "  βœ… Added load_metadata() function to parse metadata.jsonl",
        "  βœ… Added get_correct_answer() function to retrieve answers by task_id",
        "  βœ… Modified test_evaluation_question() to return correct answer",
        "  βœ… Modified test_all_evaluation_questions() to include correct answers",
        "  βœ… Updated Gradio UI with additional Correct Answer box",
        "  βœ… Updated DataFrame to show both Agent Answer and Correct Answer columns",
    ):
        print(change)
    print()
    print("πŸš€ The Evaluation Questions now show correct answers alongside agent responses!")

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    test_correct_answers()