# NOTE: "Spaces: Sleeping" lines removed — HuggingFace Spaces UI status text
# accidentally captured in a copy/paste; it was never part of this script.
| import json | |
| from pathlib import Path | |
| from agent3 import MyAgent | |
def test_agent(
    metadata_path: str = "metadata.jsonl",
    max_tests: int = 5,
):
    """
    Run up to *max_tests* questions from a GAIA metadata JSONL file through
    MyAgent and report how many agent answers match the expected answers.

    Args:
        metadata_path: Path to the GAIA metadata JSONL file (one JSON object
            per line).
        max_tests: Maximum number of questions to attempt.

    Returns:
        None. All results and the final accuracy summary are printed to
        stdout. Questions that are skipped or that raise during the agent
        run are NOT counted in the accuracy denominator.
    """
    try:
        agent = MyAgent()
    except Exception as e:
        print(f"Error initializing agent: {e}")
        return

    def normalize(s):
        # Case- and surrounding-whitespace-insensitive answer comparison.
        return str(s).strip().lower()

    correct_count = 0
    total_count = 0

    metadata_file = Path(metadata_path)
    if not metadata_file.exists():
        print(f"Metadata file not found: {metadata_path}")
        return

    with open(metadata_file, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_tests:
                break
            try:
                meta = json.loads(line)
            except json.JSONDecodeError:
                print(f"Invalid JSON on line {i+1}")
                continue

            # GAIA metadata key casing varies across dataset versions,
            # so probe the known variants for each field.
            task_id = meta.get("task_id") or meta.get("id") or ""
            question = meta.get("Question") or meta.get("text") or ""
            correct_answer = (
                meta.get("Final answer")
                or meta.get("final answer")
                or meta.get("Answer")
                or ""
            )

            print(f"--- Test {i+1}/{max_tests}: Task ID {task_id} ---")
            print(f"Question: {question}")
            if not question:
                print("Skipping: no question found\n")
                continue

            # Single call site: attach file_paths only when the task ships
            # a companion file. (Previously two duplicated try/except
            # branches, wrapped in a redundant outer try that could mask
            # bookkeeping errors.)
            try:
                file_arg = meta.get("file_name")
                if file_arg:
                    answer = agent.run(question, file_paths=[file_arg])
                else:
                    answer = agent.run(question)
            except Exception as e:
                import traceback
                print(f"Error running agent: {e}")
                print(traceback.format_exc())
                continue

            print(f"Agent Answer: {answer}")
            print(f"Correct Answer: {correct_answer}")
            if normalize(answer) == normalize(correct_answer):
                print("✅ MATCH\n")
                correct_count += 1
            else:
                print("❌ NO MATCH\n")
            # Only successfully-run questions count toward accuracy.
            total_count += 1

    print("=== Final Results ===")
    print(f"Total Tests: {total_count}")
    print(f"Correct Answers: {correct_count}")
    if total_count > 0:
        print(f"Accuracy: {correct_count / total_count * 100:.2f}%")
    else:
        print("No valid tests run.")
if __name__ == "__main__":
    import argparse

    # CLI entry point: forward the metadata path and the test budget
    # straight into test_agent().
    cli = argparse.ArgumentParser(description="Test MyAgent with GAIA metadata.")
    cli.add_argument(
        "--metadata",
        type=str,
        default="metadata.jsonl",
        help="Path to GAIA metadata JSONL",
    )
    cli.add_argument(
        "--max",
        type=int,
        default=5,
        help="Maximum number of tests to run",
    )
    opts = cli.parse_args()
    test_agent(opts.metadata, opts.max)