Final_Assignment / tests /monitor_tests.py
GAIA Developer
🧪 Add comprehensive test infrastructure and async testing system
c262d1a
#!/usr/bin/env python3
"""
Monitor GAIA test progress and provide real-time status updates
"""
import argparse
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path
def get_latest_log_file(log_dir="logs", pattern="classification_test_*.log"):
    """Return the most recently modified classification test log, if any.

    Args:
        log_dir: Directory to search (``str`` or ``Path``). Defaults to
            ``logs`` relative to the current working directory.
        pattern: Glob pattern that log file names must match.

    Returns:
        A ``Path`` to the matching file with the newest modification time, or
        ``None`` when the directory is missing or contains no readable match.
    """
    log_dir = Path(log_dir)
    if not log_dir.is_dir():
        return None

    newest_path = None
    newest_mtime = None
    for candidate in log_dir.glob(pattern):
        try:
            mtime = candidate.stat().st_mtime
        except OSError:
            # The file was rotated or deleted between glob() and stat();
            # skip it instead of crashing a long-running watch loop.
            continue
        # Strict ">" keeps the first file seen on ties, like max() would.
        if newest_mtime is None or mtime > newest_mtime:
            newest_path, newest_mtime = candidate, mtime
    return newest_path
# Matches the "[current/total]" counter on a per-question log line, wherever it
# sits on the line (a leading "[INFO]"-style tag must not be mistaken for it).
_QUESTION_COUNTER_RE = re.compile(r"\[(\d+)/(\d+)\]")
# Matches the line announcing how many questions the current agent will run.
_QUESTION_TOTAL_RE = re.compile(r"Questions to test:\s*(\d+)")


def _parse_summary_entry(line):
    """Return ``(agent, count)`` for an ``agent: N questions (P%)`` line, else ``None``."""
    parts = line.split(":")
    if len(parts) != 2:
        return None
    agent, count_part = parts[0].strip(), parts[1].strip()
    if "(" not in count_part:
        return None
    try:
        # count_part contains "(" so split() always yields at least one token.
        return agent, int(count_part.split()[0])
    except ValueError:
        return None


def parse_log_progress(log_file):
    """Parse a classification test log and extract the current progress.

    Args:
        log_file: Path to the log (``Path`` or ``str``), or ``None``.

    Returns:
        ``None`` when ``log_file`` is falsy or does not exist.
        ``{'error': message}`` when the file cannot be read.
        Otherwise a dict with the keys ``log_file``, ``last_modified``
        (``datetime``), ``classification_summary`` (agent -> question count),
        ``current_agent``, ``questions_processed`` (questions finished before
        the one currently running), ``total_questions``, ``current_question``
        and ``progress_percentage``.
    """
    if not log_file:
        return None
    log_path = Path(log_file)
    if not log_path.exists():
        return None

    try:
        # Decode explicitly as UTF-8 and replace undecodable bytes so that the
        # platform's default encoding or a stray byte cannot abort the parse.
        with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
            lines = f.readlines()
        last_modified = datetime.fromtimestamp(log_path.stat().st_mtime)
    except OSError as e:
        return {'error': str(e)}

    classification_summary = {}
    in_summary = False
    current_agent = None
    questions_processed = 0
    total_questions = 0
    current_question = None

    for raw_line in lines:
        line = raw_line.strip()

        # Classification summary section
        if "CLASSIFICATION SUMMARY:" in line:
            in_summary = True
            continue
        if in_summary and ":" in line and "questions" in line:
            entry = _parse_summary_entry(line)
            # A malformed entry is skipped rather than discarding everything.
            if entry is not None:
                classification_summary[entry[0]] = entry[1]
        elif in_summary and "Testing agent types:" in line:
            in_summary = False

        # Current testing progress
        if "TESTING" in line and "AGENT" in line:
            current_agent = line.split("TESTING")[1].split("AGENT")[0].strip()
        elif "Questions to test:" in line:
            total_match = _QUESTION_TOTAL_RE.search(line)
            if total_match:
                total_questions = int(total_match.group(1))
        elif "Testing" in line:
            counter = _QUESTION_COUNTER_RE.search(line)
            if counter:
                # The question shown in [X/Y] is still running, so only X - 1
                # questions are complete.
                questions_processed = max(0, int(counter.group(1)) - 1)
                current_question = line.split("Testing")[1].split("...")[0].strip()

    return {
        'log_file': str(log_file),
        'last_modified': last_modified,
        'classification_summary': classification_summary,
        'current_agent': current_agent,
        'questions_processed': questions_processed,
        'total_questions': total_questions,
        'current_question': current_question,
        'progress_percentage': (questions_processed / total_questions * 100) if total_questions > 0 else 0
    }
def get_latest_results(results_dir="."):
    """Load the most recently modified classification results file.

    Args:
        results_dir: Directory holding the
            ``gaia_classification_test_results_*.json`` files (``str`` or
            ``Path``). Defaults to the current working directory.

    Returns:
        A dict with the keys ``file`` (path as a string), ``metadata``,
        ``overall_stats`` and ``agent_performance`` (each an empty dict when
        absent from the file), or ``None`` when no results file exists or the
        newest one cannot be read as a JSON object.
    """
    latest_file = None
    latest_mtime = None
    for candidate in Path(results_dir).glob("gaia_classification_test_results_*.json"):
        try:
            mtime = candidate.stat().st_mtime
        except OSError:
            # File vanished between glob() and stat(); ignore it.
            continue
        if latest_mtime is None or mtime > latest_mtime:
            latest_file, latest_mtime = candidate, mtime
    if latest_file is None:
        return None

    try:
        with open(latest_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, e.g.
        # a results file that is still being written. Catching only these
        # lets KeyboardInterrupt/SystemExit propagate, unlike a bare except.
        return None

    if not isinstance(data, dict):
        # Valid JSON, but not the expected top-level object.
        return None

    return {
        'file': str(latest_file),
        'metadata': data.get('test_metadata', {}),
        'overall_stats': data.get('overall_stats', {}),
        'agent_performance': data.get('agent_performance', {})
    }
def _render_progress_bar(percentage, bar_length=30):
    """Return a text bar of ``bar_length`` cells for *percentage*, clamped to 0-100."""
    clamped = min(max(percentage, 0), 100)
    filled_length = int(bar_length * clamped / 100)
    return "β–ˆ" * filled_length + "β–‘" * (bar_length - filled_length)


def display_status(progress, results, watch_mode=False):
    """Print the monitoring dashboard to stdout.

    Args:
        progress: Progress dict as produced by ``parse_log_progress`` (either
            the progress fields or a dict with a single ``'error'`` key), or
            ``None`` when no log is available.
        results: Results dict as produced by ``get_latest_results``, or
            ``None`` when no results file is available.
        watch_mode: When true, clear the terminal before printing.
    """
    if watch_mode:
        # Clear screen in watch mode
        os.system('clear' if os.name == 'posix' else 'cls')

    print("πŸ” GAIA TEST MONITORING DASHBOARD")
    print("=" * 60)
    print(f"πŸ“… Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    if progress and 'error' not in progress:
        print(f"\nπŸ“Š CURRENT PROGRESS:")
        print(f"πŸ—‚οΈ Log File: {Path(progress['log_file']).name}")
        print(f"⏰ Last Modified: {progress['last_modified'].strftime('%H:%M:%S')}")

        if progress['current_agent']:
            print(f"\nπŸ€– Currently Testing: {progress['current_agent'].upper()} AGENT")
            print(f"πŸ“ˆ Progress: {progress['questions_processed']}/{progress['total_questions']} ({progress['progress_percentage']:.1f}%)")

            # The bar is clamped so an out-of-range percentage (e.g. a stale
            # total in the log) cannot make it longer than its fixed width.
            bar = _render_progress_bar(progress['progress_percentage'])
            print(f"β–“ Progress: [{bar}] {progress['progress_percentage']:.1f}%")

            if progress['current_question']:
                print(f"🧩 Current Question: {progress['current_question']}...")

        if progress['classification_summary']:
            print(f"\nπŸ“Š CLASSIFICATION BREAKDOWN:")
            total_questions = sum(progress['classification_summary'].values())
            for agent, count in sorted(progress['classification_summary'].items()):
                percentage = (count / total_questions) * 100 if total_questions > 0 else 0
                print(f" {agent}: {count} questions ({percentage:.1f}%)")
    elif progress and 'error' in progress:
        print(f"\n❌ ERROR reading log file: {progress['error']}")
    else:
        print(f"\n⚠️ No active test logs found")

    if results:
        print(f"\nπŸ“‹ LATEST COMPLETED RESULTS:")
        print(f"πŸ“„ Results File: {Path(results['file']).name}")

        overall = results.get('overall_stats', {})
        if overall:
            print(f"βœ… Success Rate: {overall.get('success_rate', 0):.1f}%")
            print(f"πŸ“Š Total Questions: {overall.get('total_questions', 0)}")
            print(f"βœ… Successful: {overall.get('successful', 0)}")
            print(f"❌ Errors: {overall.get('errors', 0)}")

        agent_perf = results.get('agent_performance', {})
        if agent_perf:
            print(f"\n🎯 AGENT PERFORMANCE:")
            # The stats come from a JSON file on disk, so default any missing
            # field to 0 instead of raising KeyError halfway through the output.
            rows = []
            for agent, stats in agent_perf.items():
                if not isinstance(stats, dict):
                    stats = {}
                rows.append((
                    agent,
                    stats.get('success_rate') or 0,
                    stats.get('successful', 0),
                    stats.get('total_questions', 0),
                ))
            # Best-performing agents first; the sort is stable for ties.
            rows.sort(key=lambda row: row[1], reverse=True)

            for agent, success_rate, successful, agent_total in rows:
                if success_rate >= 90:
                    status_emoji = "🟒"
                elif success_rate >= 70:
                    status_emoji = "🟑"
                else:
                    status_emoji = "πŸ”΄"
                print(f" {status_emoji} {agent}: {success_rate:.1f}% ({successful}/{agent_total})")

    print(f"\nπŸ” MONITORING OPTIONS:")
    print(f" Watch mode: python tests/monitor_tests.py --watch")
    print(f" Analyze results: python tests/analyze_test_results.py <results_file>")
    print(f" Run new test: python tests/test_by_classification.py --agent-types <type>")
def main(argv=None):
    """Parse command-line options and show the dashboard once or in a loop.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    With ``--watch`` the dashboard is redrawn every ``--interval`` seconds
    until interrupted with Ctrl+C; otherwise it is printed once.
    """
    parser = argparse.ArgumentParser(description="Monitor GAIA test progress")
    parser.add_argument('--watch', action='store_true', help='Watch mode (auto-refresh every 10s)')
    parser.add_argument('--interval', type=int, default=10, help='Refresh interval in seconds for watch mode')
    args = parser.parse_args(argv)

    # time.sleep() raises ValueError for a negative delay and a zero delay
    # would redraw in a busy loop, so reject both before entering watch mode.
    if args.watch and args.interval < 1:
        parser.error("--interval must be a positive number of seconds")

    if not args.watch:
        progress = parse_log_progress(get_latest_log_file())
        results = get_latest_results()
        display_status(progress, results, watch_mode=False)
        return

    print("πŸ‘€ Starting watch mode... (Press Ctrl+C to stop)")
    try:
        while True:
            progress = parse_log_progress(get_latest_log_file())
            results = get_latest_results()
            display_status(progress, results, watch_mode=True)
            print(f"\n⏱️ Refreshing in {args.interval}s... (Ctrl+C to stop)")
            time.sleep(args.interval)
    except KeyboardInterrupt:
        print(f"\nπŸ‘‹ Monitoring stopped.")
# Run the monitor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()