|
|
import os |
|
|
import requests |
|
|
from datasets import load_dataset |
|
|
import sys |
|
|
import contextlib |
|
|
from io import StringIO |
|
|
|
|
|
|
|
|
from task_force import TaskForce |
|
|
from langgraph_agent import get_agent as get_langgraph_agent |
|
|
|
|
|
# Base URL of the scoring service; the endpoint below is derived from it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Endpoint path built from the base URL above.
# NOTE(review): nothing in this file reads QUESTIONS_API — confirm whether
# another module imports it before removing it.
QUESTIONS_API = f"{DEFAULT_API_URL}/questions"
|
|
|
|
|
|
|
|
# Validation split of the "2023_level1" GAIA configuration, loaded once at
# import time and read by main().
# NOTE(review): this runs on import and presumably needs network access and
# Hugging Face credentials for the dataset — confirm before importing this
# module from other code.
dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation")
|
|
|
|
|
|
|
|
def main(example_index: int = 52) -> None:
    """Run the TaskForce agent on one example from ``dataset`` and report the result.

    Reads the example's task id, question and expected answer, prints them,
    builds a ``TaskForce`` agent, calls it with the question, and prints
    whether the returned answer equals the expected answer exactly.

    Args:
        example_index: Position of the example in the module-level
            ``dataset``. Defaults to 52, the example this script has
            always used.

    Returns:
        None. All output goes to stdout. Failures while constructing or
        running the agent are reported and end the run early instead of
        raising.

    Raises:
        Any error raised by indexing ``dataset`` or by a missing field in
        the example propagates to the caller.
    """
    example = dataset[example_index]
    task_id = example["task_id"]
    question = example["Question"]
    expected_answer = example["Final answer"]

    print(f"\n🧠 Task ID: {task_id}")
    print(f"📝 Question:\n{question}\n")
    print(f"Expected Answer: {expected_answer}\n")

    print("⚙️ Loading agent...")
    try:
        agent = TaskForce()
    except Exception as e:
        # Top-level boundary of the script: report and stop rather than crash.
        print(f"❌ Failed to initialize agent: {e}")
        return

    print("🤖 Running agent on question...")
    # Only the agent call is guarded, so the failure message below always
    # refers to the agent and not to the reporting code.
    try:
        answer = agent(question)
    except Exception as e:
        print(f"❌ Agent failed on task {task_id}: {e}")
        return

    print("\n=====================\nRESULTS\n=====================\n")
    # Exact equality: any difference between the two values counts as a miss.
    if answer == expected_answer:
        print("Final Answer: ", answer)
        print(f"✅ Agent successfully answered task {task_id}!")
    else:
        print(f"❌ Agent's answer for task {task_id} did not match expected answer.")
        print(f"Expected: {expected_answer}\nGot: {answer}")
|
|
|
|
|
# Run the single-example check only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()