# NOTE: "Spaces: Sleeping" lines removed — HuggingFace Spaces UI status text
# accidentally captured in a copy/paste; it was never part of this script.
| import json | |
| from pathlib import Path | |
| from agent3 import MyAgent | |
def test_agent(
    metadata_path: str = "metadata.jsonl",
    max_tests: int = 5,
):
    """
    Run up to *max_tests* questions from a GAIA metadata JSONL file through
    MyAgent and report how many agent answers match the expected answers.

    Args:
        metadata_path: Path to the GAIA metadata JSONL file (one JSON object
            per line).
        max_tests: Maximum number of questions to attempt.

    Returns:
        None. All results and the final accuracy summary are printed to
        stdout. Questions that are skipped or that raise during the agent
        run are NOT counted in the accuracy denominator.
    """
    try:
        agent = MyAgent()
    except Exception as e:
        print(f"Error initializing agent: {e}")
        return

    def normalize(s):
        # Case- and surrounding-whitespace-insensitive answer comparison.
        return str(s).strip().lower()

    correct_count = 0
    total_count = 0

    metadata_file = Path(metadata_path)
    if not metadata_file.exists():
        print(f"Metadata file not found: {metadata_path}")
        return

    with open(metadata_file, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_tests:
                break
            try:
                meta = json.loads(line)
            except json.JSONDecodeError:
                print(f"Invalid JSON on line {i+1}")
                continue

            # GAIA metadata key casing varies across dataset versions,
            # so probe the known variants for each field.
            task_id = meta.get("task_id") or meta.get("id") or ""
            question = meta.get("Question") or meta.get("text") or ""
            correct_answer = (
                meta.get("Final answer")
                or meta.get("final answer")
                or meta.get("Answer")
                or ""
            )

            print(f"--- Test {i+1}/{max_tests}: Task ID {task_id} ---")
            print(f"Question: {question}")
            if not question:
                print("Skipping: no question found\n")
                continue

            # Single call site: attach file_paths only when the task ships
            # a companion file. (Previously two duplicated try/except
            # branches, wrapped in a redundant outer try that could mask
            # bookkeeping errors.)
            try:
                file_arg = meta.get("file_name")
                if file_arg:
                    answer = agent.run(question, file_paths=[file_arg])
                else:
                    answer = agent.run(question)
            except Exception as e:
                import traceback
                print(f"Error running agent: {e}")
                print(traceback.format_exc())
                continue

            print(f"Agent Answer: {answer}")
            print(f"Correct Answer: {correct_answer}")
            if normalize(answer) == normalize(correct_answer):
                print("✅ MATCH\n")
                correct_count += 1
            else:
                print("❌ NO MATCH\n")
            # Only successfully-run questions count toward accuracy.
            total_count += 1

    print("=== Final Results ===")
    print(f"Total Tests: {total_count}")
    print(f"Correct Answers: {correct_count}")
    if total_count > 0:
        print(f"Accuracy: {correct_count / total_count * 100:.2f}%")
    else:
        print("No valid tests run.")
if __name__ == "__main__":
    import argparse

    # CLI entry point: forward the metadata path and the test budget
    # straight into test_agent().
    cli = argparse.ArgumentParser(description="Test MyAgent with GAIA metadata.")
    cli.add_argument(
        "--metadata",
        type=str,
        default="metadata.jsonl",
        help="Path to GAIA metadata JSONL",
    )
    cli.add_argument(
        "--max",
        type=int,
        default=5,
        help="Maximum number of tests to run",
    )
    opts = cli.parse_args()
    test_agent(opts.metadata, opts.max)