#!/usr/bin/env python3
"""
Monitor GAIA test progress and provide real-time status updates
"""
import os
import time
import json
from pathlib import Path
from datetime import datetime
import argparse


def get_latest_log_file():
    """Find the most recent classification test log file"""
    log_dir = Path("logs")
    if not log_dir.exists():
        return None
    log_files = list(log_dir.glob("classification_test_*.log"))
    if not log_files:
        return None
    return max(log_files, key=lambda x: x.stat().st_mtime)


def parse_log_progress(log_file):
    """Parse log file to extract current progress"""
    if not log_file or not log_file.exists():
        return None
    try:
        with open(log_file, 'r') as f:
            lines = f.readlines()

        # Classification summary state
        classification_summary = {}
        in_summary = False

        # Testing progress state
        current_agent = None
        questions_processed = 0
        total_questions = 0
        current_question = None

        for line in lines:
            line = line.strip()

            # Classification summary section
            if "CLASSIFICATION SUMMARY:" in line:
                in_summary = True
                continue
            elif in_summary and ":" in line and "questions" in line:
                parts = line.split(":")
                if len(parts) == 2:
                    agent = parts[0].strip()
                    count_part = parts[1].strip()
                    if "(" in count_part:
                        count = int(count_part.split()[0])
                        classification_summary[agent] = count
            elif in_summary and "Testing agent types:" in line:
                in_summary = False

            # Current testing progress
            if "TESTING" in line and "AGENT" in line:
                current_agent = line.split("TESTING")[1].split("AGENT")[0].strip()
            elif "Questions to test:" in line:
                total_questions = int(line.split(":")[-1].strip())
            elif "Testing" in line and "/" in line and "]" in line:
                # Extract the current question number from the [X/Y] marker
                bracket_part = line.split("[")[1].split("]")[0]
                current_num = int(bracket_part.split("/")[0])
                questions_processed = current_num - 1  # the current question is still in progress
                current_question = line.split("Testing")[1].split("...")[0].strip()

        return {
            'log_file': str(log_file),
            'last_modified': datetime.fromtimestamp(log_file.stat().st_mtime),
            'classification_summary': classification_summary,
            'current_agent': current_agent,
            'questions_processed': questions_processed,
            'total_questions': total_questions,
            'current_question': current_question,
            'progress_percentage': (questions_processed / total_questions * 100) if total_questions > 0 else 0
        }
    except Exception as e:
        return {'error': str(e)}
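

# Illustrative only: the exact wording of the test-runner log is not shown in this
# file. The line shapes below are inferred from the parsing in parse_log_progress();
# the agent names, counts, and question ID are made-up placeholders.
#
#   CLASSIFICATION SUMMARY:
#     web: 53 questions (32.1%)
#     code: 41 questions (24.8%)
#   Testing agent types: web, code
#   ======== TESTING WEB AGENT ========
#   Questions to test: 53
#   [3/53] Testing a1b2c3d4...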


def get_latest_results():
    """Get the latest test results file"""
    result_files = list(Path(".").glob("gaia_classification_test_results_*.json"))
    if not result_files:
        return None
    latest_file = max(result_files, key=lambda x: x.stat().st_mtime)
    try:
        with open(latest_file, 'r') as f:
            data = json.load(f)
        return {
            'file': str(latest_file),
            'metadata': data.get('test_metadata', {}),
            'overall_stats': data.get('overall_stats', {}),
            'agent_performance': data.get('agent_performance', {})
        }
    except (OSError, json.JSONDecodeError):
        # Treat unreadable or malformed results files as "no results"
        return None
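

# Illustrative only: the results JSON is produced by the test script, not by this
# monitor. Based on the keys read here and in display_status(), its shape is roughly
# as sketched below; all values are made-up placeholders.
#
#   {
#     "test_metadata": {"...": "..."},
#     "overall_stats": {"success_rate": 85.0, "total_questions": 20,
#                       "successful": 17, "errors": 3},
#     "agent_performance": {"web": {"success_rate": 90.0, "successful": 9,
#                                    "total_questions": 10}}
#   }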


def display_status(progress, results, watch_mode=False):
    """Display current test status"""
    if watch_mode:
        # Clear screen in watch mode
        os.system('clear' if os.name == 'posix' else 'cls')

    print("GAIA TEST MONITORING DASHBOARD")
    print("=" * 60)
    print(f"Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    if progress and 'error' not in progress:
        print("\nCURRENT PROGRESS:")
        print(f"  Log File: {Path(progress['log_file']).name}")
        print(f"  Last Modified: {progress['last_modified'].strftime('%H:%M:%S')}")

        if progress['current_agent']:
            print(f"\nCurrently Testing: {progress['current_agent'].upper()} AGENT")
            print(f"  Progress: {progress['questions_processed']}/{progress['total_questions']} ({progress['progress_percentage']:.1f}%)")

            # Progress bar
            bar_length = 30
            filled_length = int(bar_length * progress['progress_percentage'] / 100)
            bar = "█" * filled_length + "░" * (bar_length - filled_length)
            print(f"  Progress: [{bar}] {progress['progress_percentage']:.1f}%")

            if progress['current_question']:
                print(f"  Current Question: {progress['current_question']}...")

        if progress['classification_summary']:
            print("\nCLASSIFICATION BREAKDOWN:")
            total_questions = sum(progress['classification_summary'].values())
            for agent, count in sorted(progress['classification_summary'].items()):
                percentage = (count / total_questions) * 100 if total_questions > 0 else 0
                print(f"  {agent}: {count} questions ({percentage:.1f}%)")
    elif progress and 'error' in progress:
        print(f"\nERROR reading log file: {progress['error']}")
    else:
        print("\nNo active test logs found")

    if results:
        print("\nLATEST COMPLETED RESULTS:")
        print(f"  Results File: {Path(results['file']).name}")
        overall = results.get('overall_stats', {})
        if overall:
            print(f"  Success Rate: {overall.get('success_rate', 0):.1f}%")
            print(f"  Total Questions: {overall.get('total_questions', 0)}")
            print(f"  Successful: {overall.get('successful', 0)}")
            print(f"  Errors: {overall.get('errors', 0)}")

        agent_perf = results.get('agent_performance', {})
        if agent_perf:
            print("\nAGENT PERFORMANCE:")
            for agent, stats in sorted(agent_perf.items(), key=lambda x: x[1]['success_rate'], reverse=True):
                success_rate = stats['success_rate']
                status_emoji = "🟢" if success_rate >= 90 else "🟡" if success_rate >= 70 else "🔴"
                print(f"  {status_emoji} {agent}: {success_rate:.1f}% ({stats['successful']}/{stats['total_questions']})")

    print("\nMONITORING OPTIONS:")
    print("  Watch mode:      python tests/monitor_tests.py --watch")
    print("  Analyze results: python tests/analyze_test_results.py <results_file>")
    print("  Run new test:    python tests/test_by_classification.py --agent-types <type>")


def main():
    """Main monitoring interface"""
    parser = argparse.ArgumentParser(description="Monitor GAIA test progress")
    parser.add_argument('--watch', action='store_true', help='Watch mode (auto-refresh every 10s)')
    parser.add_argument('--interval', type=int, default=10, help='Refresh interval in seconds for watch mode')
    args = parser.parse_args()

    if args.watch:
        print("Starting watch mode... (Press Ctrl+C to stop)")
        try:
            while True:
                progress = parse_log_progress(get_latest_log_file())
                results = get_latest_results()
                display_status(progress, results, watch_mode=True)
                print(f"\nRefreshing in {args.interval}s... (Ctrl+C to stop)")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            print("\nMonitoring stopped.")
    else:
        progress = parse_log_progress(get_latest_log_file())
        results = get_latest_results()
        display_status(progress, results, watch_mode=False)


if __name__ == "__main__":
    main()