def test_questions():
    """Return the built-in list of sample evaluation questions.

    Despite the ``test_`` prefix this is a data provider, not a test case.

    Returns:
        A new list of dicts on every call. Each dict has the keys
        ``"task_id"`` (str), ``"question"`` (str) and ``"expected_keywords"``
        (list of str).
    """
    return [
        {
            "task_id": "q4",
            # NOTE(review): the question text stops mid-sentence; judging by the
            # expected keywords this looks deliberate (the agent should ask for
            # the missing part) — confirm before "fixing" the wording.
            "question": (
                "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, "
                "but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it"
            ),
            "expected_keywords": ["clarify", "incomplete", "missing", "please provide", "need more information"],
        },
        {
            "task_id": "q7",
            "question": (
                "Given this table defining * on the set S = {a, b, c, d, e}\n\n"
                "|*|a|b|c|d|e|\n"
                "|---|---|---|---|---|---|\n"
                "|a|a|b|c|b|d|\n"
                "|b|b|c|a|e|c|\n"
                "|c|c|a|b|b|a|\n"
                "|d|b|e|b|e|d|\n"
                "|e|d|b|a|d|c|\n\n"
                "Provide the subset of S involved in any possible counter-examples that prove * is not commutative. "
                "Provide your answer as a comma-separated list of the elements in the set in alphabetical order."
            ),
            "expected_keywords": ["b, e"],
        },
        {
            "task_id": "q3",
            # The question is written backwards on purpose; the agent has to
            # reverse it to read the instruction.
            "question": (
                "'.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI'"
            ),
            "expected_keywords": ["right"],
        },
        {
            "task_id": "q2",
            "question": (
                "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
                "Use Wikipedia to find the answer."
            ),
            "expected_keywords": ["3", "three"],
        },
        {
            "task_id": "q4b",
            "question": (
                "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
            ),
            "expected_keywords": ["FunkMonk"],
        },
        {
            "task_id": "q5",
            "question": (
                "Who is the CEO of OpenAI?"
            ),
            "expected_keywords": ["sam altman"],
        },
        {
            "task_id": "q6",
            "question": (
                "When was the Eiffel Tower built?"
            ),
            "expected_keywords": ["1889"],
        },
    ]


# The name matches pytest's default ``test_*`` pattern, but the function returns
# data. Opt out of collection so pytest never runs it as a test (and never
# complains about the non-None return value).
test_questions.__test__ = False


def _answer_matches(answer, keywords):
    """Return True if any keyword occurs in *answer* as a whole word or phrase.

    Matching is case-insensitive. A keyword only counts when it is not
    embedded in a longer run of word characters, so ``"3"`` matches
    ``"3 albums"`` but not ``"2003"``. ``None`` is treated as an empty answer;
    any other non-string answer or keyword is converted with ``str()``.
    Empty keywords never match.
    """
    import re  # function-scope import keeps this block self-contained

    text = "" if answer is None else str(answer)
    for keyword in keywords:
        needle = str(keyword)
        if not needle:
            continue
        # Lookarounds instead of \b so keywords that start or end with
        # punctuation still match correctly.
        pattern = rf"(?<!\w){re.escape(needle)}(?!\w)"
        if re.search(pattern, text, flags=re.IGNORECASE):
            return True
    return False


def evaluate_agent(agent, questions):
    """Run *agent* on every question, print a transcript, and return the score.

    Args:
        agent: Callable invoked as ``agent(question_text)``; its return value
            is the answer to grade.
        questions: Sized iterable of dicts, each with a ``"question"`` string
            and an ``"expected_keywords"`` list. An answer is correct when at
            least one expected keyword appears in it as a whole word or
            phrase (case-insensitive).

    Returns:
        The summary line ``"✅ Score: <correct> / <total> correct"``.

    An exception raised by the agent is printed and the question is counted
    as not correct; evaluation continues with the next question.
    """
    print("\n\n========= Running GAIA Evaluation =========\n")
    correct = 0
    total = len(questions)

    for q in questions:
        print(f"🟨 Q: {q['question']}")
        try:
            answer = agent(q["question"])
        except Exception as e:
            # Deliberately broad: one failing question must not abort the
            # whole evaluation run.
            print(f"🟥 Error: {e}\n")
            continue

        print(f"🟩 A: {answer}\n")
        keywords = q["expected_keywords"]
        matched = _answer_matches(answer, keywords)
        expected = ", ".join(str(keyword) for keyword in keywords)
        result = "✅ Correct" if matched else "❌ Incorrect"
        print(f"{result} — Expected one of: [{expected}]\n")
        if matched:
            correct += 1

    score_report = f"✅ Score: {correct} / {total} correct"
    print(score_report + "\n")
    return score_report