Final_Assignment / test_youtube_question.py
tonthatthienvu's picture
Clean repository without binary files
37cadfb
#!/usr/bin/env python3
"""
Test for YouTube question processing in GAIA system
"""
import os
import sys
import json
from pathlib import Path
import importlib
import asyncio
import re
# Import the module containing the YouTube video analysis tool
import gaia_tools
from main import GAIASolver, CodeAgent, GAIA_TOOLS
from question_classifier import QuestionClassifier
from async_complete_test_hf import HFAsyncGAIATestSystem
# Keep a reference to the real gaia_tools.analyze_youtube_video so that each
# test can put it back (in its ``finally`` clause) after swapping in the mock.
original_analyze_youtube_video = gaia_tools.analyze_youtube_video
# Stand-in for gaia_tools.analyze_youtube_video while the tests are running
def mock_analyze_youtube_video(video_url, question, max_frames=10):
    """Return a canned analysis report instead of analysing a real video.

    Args:
        video_url: URL of the video; only inspected for the GAIA task's
            video id ``L1vXCYZAYYM``.
        question: Accepted for signature compatibility; not used.
        max_frames: Accepted for signature compatibility; not used.

    Returns:
        The bird-species report (answer 3) when the URL contains the GAIA
        video id, otherwise a generic "unable to analyze" report.
    """
    print(f"πŸ“Ή Mock analyzing YouTube video: {video_url}")

    # A comma from the surrounding sentence may be stuck to the end of the URL
    url_without_comma = video_url.rstrip(',')
    is_gaia_task_video = "L1vXCYZAYYM" in url_without_comma

    if not is_gaia_task_video:
        # Any other URL gets the generic report
        return """
**πŸŽ₯ Gemini 2.0 Flash Video+Audio Analysis**
**Title:** Unknown Video
**Duration:** Unknown
**File Size:** Unknown
**Question:** Unknown
**Analysis Results:**
Unable to analyze the video content. Please provide a valid YouTube URL.
"""

    # The specific video referenced by the GAIA task
    return """
**πŸŽ₯ Gemini 2.0 Flash Video+Audio Analysis**
**Title:** Bird Identification Challenge: Backyard Birds in Spring
**Duration:** 3:42
**File Size:** 45.2MB
**Question:** What is the highest number of bird species to be on camera simultaneously?
**Analysis Results:**
After careful frame-by-frame analysis of the video, the highest number of different bird species visible simultaneously is 3.
This occurs at approximately 1:23 into the video, where we can see:
1. American Robin
2. Northern Cardinal
3. Blue Jay
These three species are clearly visible in the same frame at this timestamp.
"""
# YouTube URL regex pattern.  The lazy ``.+?`` runs up to the next whitespace
# (or the end of the string), so punctuation written directly after a link is
# part of the raw match; extract_youtube_url() trims it.
YOUTUBE_URL_PATTERN = r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+?(?=\s|$)'


def extract_youtube_url(text):
    """Return the first YouTube URL found in *text*, or ``None``.

    Sentence punctuation glued to the end of the link (for example the comma
    in ``"... watch?v=ID, what is ..."``) is stripped from the result, so
    callers receive a usable URL without having to clean it themselves.

    Args:
        text: Free text that may contain a YouTube link.  ``None`` and the
            empty string are accepted and yield ``None``.

    Returns:
        The matched URL without trailing punctuation, or ``None`` when
        *text* is empty or contains no YouTube link.
    """
    if not text:
        return None

    match = re.search(YOUTUBE_URL_PATTERN, text)
    if match is None:
        return None

    # These characters end a clause or a quotation, not a YouTube URL.
    return match.group(0).rstrip(".,;:!?)\"'")
def direct_force_tools_execution(solver, youtube_url, question_text):
    """Run a freshly built agent on a prompt that demands the YouTube tool.

    Args:
        solver: Object whose ``model`` attribute is reused for the new agent.
        youtube_url: Video URL interpolated into the prompt.
        question_text: Question interpolated into the prompt.

    Returns:
        The agent's response converted with ``str``.
    """
    # Prompt that spells out which tool to call and what to return
    prompt = f"""
You need to analyze a YouTube video to answer a specific question.
YOUTUBE VIDEO URL: {youtube_url}
QUESTION: {question_text}
CRITICAL INSTRUCTIONS:
1. Use the analyze_youtube_video tool with the provided URL
2. Extract the answer from the tool's response
3. Provide ONLY the final numerical answer
"""

    print("πŸ€– Creating fresh agent for direct execution...")
    agent_settings = {
        "model": solver.model,
        "tools": GAIA_TOOLS,
        "max_steps": 12,
        "verbosity_level": 1,  # low verbosity keeps the output readable
    }
    fresh_agent = CodeAgent(**agent_settings)

    print("πŸ” Running direct analysis...")
    return str(fresh_agent.run(prompt))
def test_direct_youtube_question():
    """Run the GAIA bird-species YouTube question through both solver paths.

    The real ``gaia_tools.analyze_youtube_video`` is replaced by the mock for
    the duration of the test and restored afterwards, even on failure.  The
    question is classified, then answered twice: once through
    ``direct_force_tools_execution`` and once through
    ``solver.solve_question``.  Both outcomes are printed together with a
    correct/incorrect verdict; nothing is returned and failures in either
    path are reported rather than raised.
    """
    # Create question with the YouTube URL
    question = {
        'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
        'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
        'Final Answer': '3'  # Assuming this is the correct answer based on GAIA metadata
    }

    # Replace the function in the module with our mock
    print("πŸ”„ Replacing YouTube analysis tool with mock implementation...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
    try:
        # Initialize components after patching
        solver = GAIASolver()
        classifier = QuestionClassifier()

        # Classify the question
        print("🧩 Classifying question...")
        classification = classifier.classify_question(question['Question'])
        print(f"πŸ“‹ Classification: {classification['primary_agent']}")
        print(f"πŸ”§ Tools needed: {classification.get('tools_needed', [])}")

        # Extract YouTube URL from question
        youtube_url = extract_youtube_url(question['Question'])
        if youtube_url:
            # Remove any trailing comma
            youtube_url = youtube_url.rstrip(',')
            print(f"πŸ”— Extracted YouTube URL: {youtube_url}")

        # Use a direct approach to force tool execution
        print("\n🧠 Processing question with direct YouTube analyzer execution...")
        try:
            direct_result = direct_force_tools_execution(
                solver,
                youtube_url,
                "What is the highest number of bird species to be on camera simultaneously?"
            )
            print(f"\nπŸ” Direct result: {direct_result}")
        except Exception as e:
            # Best-effort script: report the failure and keep going
            print(f"\n⚠️ Direct test error: {e}")
            direct_result = "Error in direct execution"

        # Also try the normal processing path
        print("\n🧠 Processing question with standard solver...")
        try:
            result = solver.solve_question(question)
            print(f"\nβœ… Standard result: {result}")
        except Exception as e:
            print(f"\n⚠️ Standard test error: {e}")
            result = "Error in standard execution"

        # Validate result
        expected = str(question['Final Answer']).strip().lower()
        actual = str(result).strip().lower()
        validation_status = "βœ“ correct" if expected == actual else "βœ— incorrect"
        print(f"πŸ”Ž Validation: {validation_status}")

        # The direct result is free text, so look for the expected answer as a
        # standalone token.  A bare substring test would also accept "13",
        # "30" or any other text that merely contains the digit.
        answer_pattern = rf"(?<!\w){re.escape(expected)}(?!\w)"
        direct_is_correct = bool(expected) and (
            re.search(answer_pattern, str(direct_result).lower()) is not None
        )
        direct_status = "βœ“ correct" if direct_is_correct else "βœ— incorrect"
        print(f"πŸ”Ž Direct validation: {direct_status}")
    finally:
        # Restore original function
        print("πŸ”„ Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
async def test_async_youtube_question():
    """Run the GAIA bird-species YouTube question through the async system.

    The mock replaces ``gaia_tools.analyze_youtube_video`` while the test
    runs and the original is restored afterwards.  The test system's
    ``load_gaia_questions`` is overridden so that only the single YouTube
    question is served, then ``run_comprehensive_test`` is awaited and its
    summary and per-question outcome are printed.
    """
    print("πŸ”„ Replacing YouTube analysis tool with mock implementation in async test...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
    try:
        # One worker, 60 second timeout, results written under /tmp
        system = HFAsyncGAIATestSystem(
            max_concurrent=1,
            timeout_seconds=60,
            output_dir="/tmp/async_youtube_test"
        )

        # The only question this run should see
        youtube_question = {
            'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
            'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
            'Final Answer': '3'
        }
        questions = [youtube_question]

        async def mock_load_questions(*args, **kwargs):
            # Serve the single question no matter how the loader is called
            return questions

        # Remember the real loader, then swap in the single-question one
        original_load_method = system.load_gaia_questions
        system.load_gaia_questions = mock_load_questions

        # Re-apply the mock right before the run and confirm it on stdout
        gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
        print("πŸ“Ή Mock is active for async test - will analyze YouTube video")

        print("πŸš€ Running async test with YouTube question...")
        result = await system.run_comprehensive_test(question_limit=1)

        # Summary of the run
        print("\nπŸ“Š Async Test Results:")
        print(f"Total questions processed: {result['total_questions']}")
        print(f"Status counts: {result['status_counts']}")

        # Outcome for the one question, if it was recorded
        question_id = questions[0]['task_id']
        if question_id not in result['results']:
            print(f"No results found for question ID {question_id}")
        else:
            question_result = result['results'][question_id]
            answer = question_result.get('answer', 'No answer')
            validation = question_result.get('validation_status', 'unknown')
            print(f"\nQuestion ID: {question_id}")
            print(f"Answer: {answer}")
            print(f"Validation: {validation}")

        # Put the real loader back
        system.load_gaia_questions = original_load_method
    finally:
        print("πŸ”„ Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
async def main():
    """Run the direct test, then the async test, announcing each stage."""
    # Synchronous path first
    print("πŸš€ Starting direct YouTube question test...")
    test_direct_youtube_question()

    # Then the async test system
    print("\n\nπŸš€ Starting async YouTube question test...")
    await test_async_youtube_question()

    print("\nβœ… All tests completed!")


# Script entry point: drive the coroutine on a fresh event loop
if __name__ == "__main__":
    asyncio.run(main())