#!/usr/bin/env python3
"""
Run comprehensive GAIA tests across all classification groups.
This script orchestrates the complete testing workflow and analysis.
"""
import subprocess
import time
import json
from pathlib import Path
from datetime import datetime


def run_command(command, description, timeout=1800):
    """Run a shell command with a timeout and capture its output."""
    print(f"\n🚀 {description}")
    print(f"Command: {command}")
    print("-" * 60)
    try:
        result = subprocess.run(
            command,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout
        )
        if result.returncode == 0:
            print("✅ SUCCESS")
            print(f"Output: {result.stdout[:500]}...")
            return True, result.stdout
        else:
            print("❌ FAILED")
            print(f"Error: {result.stderr[:500]}...")
            return False, result.stderr
    except subprocess.TimeoutExpired:
        print(f"⏰ TIMEOUT after {timeout}s")
        return False, "Command timed out"
    except Exception as e:
        print(f"💥 EXCEPTION: {e}")
        return False, str(e)


def main():
    """Run the comprehensive testing workflow."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    print("🎯 COMPREHENSIVE GAIA TESTING WORKFLOW")
    print("=" * 70)
    print(f"Started: {datetime.now()}")

    # Every test command is prefixed with virtual environment activation
    venv_prefix = "source venv/bin/activate &&"
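    # NOTE: `source` is a bash builtin, and subprocess.run(shell=True) invokes /bin/sh,
    # so this assumes /bin/sh is bash-compatible. On systems where it is not (e.g. dash),
    # use the POSIX `.` command instead or pass executable="/bin/bash" to subprocess.run.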
    # Test plan - run each agent type separately for better error analysis
    test_plan = [
        {
            "name": "Research Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types research",
            "timeout": 1800,
            "priority": "HIGH"
        },
        {
            "name": "Multimedia Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types multimedia",
            "timeout": 2400,
            "priority": "HIGH"
        },
        {
            "name": "Logic/Math Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types logic_math",
            "timeout": 1200,
            "priority": "MEDIUM"
        },
        {
            "name": "File Processing Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types file_processing",
            "timeout": 900,
            "priority": "MEDIUM"
        },
        {
            "name": "All Agent Types (Complete)",
            "command": f"{venv_prefix} python tests/test_by_classification.py",
            "timeout": 3600,
            "priority": "LOW"
        }
    ]
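    # Timeouts are per command, in seconds; priorities only control how any
    # failures are grouped in the summary printed after the run.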

    results = []

    # Execute test plan
    for i, test in enumerate(test_plan, 1):
        print(f"\n{'=' * 20} TEST {i}/{len(test_plan)} {'=' * 20}")
        print(f"Name: {test['name']}")
        print(f"Priority: {test['priority']}")

        start_time = time.time()
        success, output = run_command(
            test['command'],
            test['name'],
            test['timeout']
        )
        end_time = time.time()

        result = {
            'test_name': test['name'],
            'command': test['command'],
            'priority': test['priority'],
            'success': success,
            'duration': end_time - start_time,
            'output_preview': output[:200] if output else "",
            'timestamp': datetime.now().isoformat()
        }
        results.append(result)

        # Brief pause between tests
        time.sleep(5)

    # Generate summary report
    print("\n📊 COMPREHENSIVE TEST SUMMARY")
    print("=" * 70)

    total_tests = len(test_plan)
    successful_tests = len([r for r in results if r['success']])
    failed_tests = total_tests - successful_tests

    print(f"Total Tests: {total_tests}")
    print(f"Successful: {successful_tests} ({successful_tests / total_tests * 100:.1f}%)")
    print(f"Failed: {failed_tests} ({failed_tests / total_tests * 100:.1f}%)")

    print("\n📋 DETAILED RESULTS:")
    for result in results:
        status = "✅" if result['success'] else "❌"
        duration = result['duration']
        print(f"  {status} {result['test_name']}: {duration:.1f}s ({result['priority']} priority)")

    # Save comprehensive results
    results_file = f"comprehensive_test_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            'metadata': {
                'timestamp': timestamp,
                'total_tests': total_tests,
                'successful_tests': successful_tests,
                'failed_tests': failed_tests,
                'success_rate': successful_tests / total_tests * 100
            },
            'test_results': results
        }, f, indent=2)

    print(f"\n💾 Results saved to: {results_file}")

    # Generate action items based on results
    print("\n📝 NEXT STEPS:")

    high_priority_failures = [r for r in results if not r['success'] and r['priority'] == 'HIGH']
    if high_priority_failures:
        print("🔴 HIGH PRIORITY FIXES NEEDED:")
        for failure in high_priority_failures:
            print(f"  - Fix {failure['test_name']}")
            print(f"    Command: {failure['command']}")

    medium_priority_failures = [r for r in results if not r['success'] and r['priority'] == 'MEDIUM']
    if medium_priority_failures:
        print("🟡 MEDIUM PRIORITY IMPROVEMENTS:")
        for failure in medium_priority_failures:
            print(f"  - Optimize {failure['test_name']}")

    if successful_tests == total_tests:
        print("🎉 ALL TESTS PASSED! Ready for production use.")
        print("💡 Consider running specific error analysis on individual results files")

    # Find the most recent results files for analysis
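    # (test_by_classification.py is assumed to write per-run logs under logs/ and
    # results JSON files into the working directory; the globs below pick up the
    # newest of each)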
    log_files = list(Path("logs").glob("classification_test_*.log"))
    if log_files:
        latest_log = max(log_files, key=lambda x: x.stat().st_mtime)
        print(f"📄 Latest log file: {latest_log}")

    result_files = list(Path(".").glob("gaia_classification_test_results_*.json"))
    if result_files:
        latest_results = max(result_files, key=lambda x: x.stat().st_mtime)
        print(f"📊 Latest results: {latest_results}")
        print(f"🔍 Analyze with: python tests/analyze_test_results.py {latest_results}")

    print("\n✅ COMPREHENSIVE TESTING COMPLETE!")
    print(f"Total Duration: {sum(r['duration'] for r in results):.1f}s")


if __name__ == "__main__":
    main()