# simoncwang
# new attempt with better tools
# c2b686e
import os
import requests
from datasets import load_dataset
import sys
import contextlib
from io import StringIO
# Import one of the agents (toggle as needed)
from task_force import TaskForce
from langgraph_agent import get_agent as get_langgraph_agent
# Base URL of the remote API and its question-listing endpoint.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
QUESTIONS_API = DEFAULT_API_URL + "/questions"

# GAIA benchmark, "2023_level1" configuration, validation split.
# NOTE: this call runs at import time, so importing the module loads the data.
dataset = load_dataset(
    "gaia-benchmark/GAIA",
    name="2023_level1",
    split="validation",
)
def main(example_index: int = 52) -> None:
    """Run the agent on one GAIA example and report whether it matched.

    Looks up ``dataset[example_index]``, prints the task id, question and
    expected answer, builds a ``TaskForce`` agent, calls it with the question
    text, and prints whether the agent's answer equals the expected answer.
    Every failure is reported on stdout and ends the run early; nothing is
    raised to the caller.

    Args:
        example_index: Position of the example in the module-level
            ``dataset``. Defaults to 52, the example this script was
            originally hard-wired to.
    """
    # NOTE: an alternative question source is a GET on QUESTIONS_API, whose
    # JSON response appears to be a list of dicts with "task_id" and
    # "question" keys (TODO confirm). The local dataset is used instead so
    # that the expected answer is available for comparison.
    try:
        example = dataset[example_index]
    except IndexError:
        print(f"❌ No example at index {example_index} in the dataset.")
        return

    task_id = example["task_id"]
    question = example["Question"]
    expected_answer = example["Final answer"]

    print(f"\n🧠 Task ID: {task_id}")
    print(f"📌 Question:\n{question}\n")
    print(f"Expected Answer: {expected_answer}\n")

    print("⚙️ Loading agent...")
    try:
        # Swap in get_langgraph_agent() here to exercise the other agent.
        agent = TaskForce()
    except Exception as e:
        print(f"❌ Failed to initialize agent: {e}")
        return

    print("🤖 Running agent on question...")
    try:
        # To silence the agent's own prints, wrap this call in
        # contextlib.redirect_stdout(StringIO()).
        answer = agent(question)
    except Exception as e:
        print(f"❌ Agent failed on task {task_id}: {e}")
        return

    # Reporting lives outside the try block so that an error raised while
    # comparing or printing is not misreported as an agent failure.
    print("\n=====================\nRESULTS\n=====================\n")

    # Compare as stripped strings: the agent may return a non-string value or
    # pad its answer with whitespace, neither of which is a wrong answer.
    if str(answer).strip() == str(expected_answer).strip():
        print("Final Answer: ", answer)
        print(f"✅ Agent successfully answered task {task_id}!")
    else:
        print(f"❌ Agent's answer for task {task_id} did not match expected answer.")
        print(f"Expected: {expected_answer}\nGot: {answer}")
# Run the single-question check only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()