Spaces:
Sleeping
Sleeping
File size: 2,870 Bytes
4f6b4f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
#!/usr/bin/env python3
"""
Test script to demonstrate the new correct answer functionality
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from agent_test_client import load_metadata, get_correct_answer, questions
def test_correct_answers():
    """Print a demonstration of the correct-answer lookup helpers.

    Loads the answer metadata with ``load_metadata()``, prints the first
    five entries of ``questions`` next to the value returned by
    ``get_correct_answer()`` for each task ID, then looks up a fixed list of
    task IDs and prints a summary of the related changes.

    Nothing is asserted and nothing is returned; all output goes to stdout.
    """
    # NOTE(review): several emoji in these messages were mangled by a wrong
    # text decoding. The ones whose bytes survived (🧪, 🎯, ✅) are restored
    # here; the remaining "π" / "β" prefixes lost bytes and are left
    # untouched -- restore them from the upstream copy of this file.
    print("🧪 Testing Correct Answer Functionality")
    print("=" * 50)

    # Load metadata
    metadata = load_metadata()
    print(f"π Loaded {len(metadata)} correct answers from metadata.jsonl")
    print()

    # Test a few sample questions
    print("π Sample Questions with Correct Answers:")
    print("-" * 40)
    # Show first 5 questions (fewer if the list is shorter).
    for number, question_data in enumerate(questions[:5], start=1):
        task_id = question_data.get("task_id", "Unknown")
        question_text = question_data.get("question", "No question")
        level = question_data.get("Level", "Unknown")

        # Get correct answer
        correct_answer = get_correct_answer(task_id)

        # Long questions are cut to 100 characters and marked with "...".
        ellipsis = "..." if len(question_text) > 100 else ""
        print(f"Question {number}:")
        print(f" π Task ID: {task_id}")
        print(f" π Level: {level}")
        print(f" β Question: {question_text[:100]}{ellipsis}")
        print(f" ✅ Correct Answer: {correct_answer or 'Not found'}")
        print()

    # Test specific questions by task_id
    print("🎯 Testing Specific Questions:")
    print("-" * 30)
    test_cases = [
        "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",  # Mercedes Sosa - should be "3"
        "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",  # YouTube video - should be "3"
        "2d83110e-a098-4ebb-9987-066c06fa42d0",  # Reversed text - should be "Right"
        "cca530fc-4052-43b2-b130-b30968d8aa44",  # Chess - should be "Rd5"
        "6f37996b-2ac7-44b0-8e68-6d28256631b4",  # Math table - should be "b, e"
    ]
    for task_id in test_cases:
        answer = get_correct_answer(task_id)
        # Only the first 8 characters of the ID are shown to keep lines short.
        print(f"π {task_id[:8]}...: {answer or 'Not found'}")
    print()

    print("π Correct answer functionality is working!")
    print()
    print("π Summary of Changes Made:")
    changes = (
        "Added load_metadata() function to parse metadata.jsonl",
        "Added get_correct_answer() function to retrieve answers by task_id",
        "Modified test_evaluation_question() to return correct answer",
        "Modified test_all_evaluation_questions() to include correct answers",
        "Updated Gradio UI with additional Correct Answer box",
        "Updated DataFrame to show both Agent Answer and Correct Answer columns",
    )
    for change in changes:
        print(f" ✅ {change}")
    print()
    print("π The Evaluation Questions now show correct answers alongside agent responses!")
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    test_correct_answers()