def test_questions() -> list:
    """Return the built-in list of evaluation questions.

    Each entry is a dict with three keys:

    * ``task_id`` - short identifier of the question.
    * ``question`` - the prompt text handed to the agent.
    * ``expected_keywords`` - list of strings; an answer is counted as
      correct by ``evaluate_agent`` when it contains at least one of them
      (case-insensitive substring match).

    A new list is built on every call, so callers may mutate the result.
    """
    return [
        {
            "task_id": "q4",
            "question": (
                "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, "
                "but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it"
            ),
            "expected_keywords": ["clarify", "incomplete", "missing", "please provide", "need more information"],
        },
        {
            "task_id": "q7",
            "question": (
                "Given this table defining * on the set S = {a, b, c, d, e}\n\n"
                "|*|a|b|c|d|e|\n"
                "|---|---|---|---|---|---|\n"
                "|a|a|b|c|b|d|\n"
                "|b|b|c|a|e|c|\n"
                "|c|c|a|b|b|a|\n"
                "|d|b|e|b|e|d|\n"
                "|e|d|b|a|d|c|\n\n"
                "Provide the subset of S involved in any possible counter-examples that prove * is not commutative. "
                "Provide your answer as a comma-separated list of the elements in the set in alphabetical order."
            ),
            "expected_keywords": ["b, e"],
        },
        {
            "task_id": "q3",
            "question": (
                "'.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI'"
            ),
            "expected_keywords": ["right"],
        },
        {
            "task_id": "q2",
            "question": (
                "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
                "Use Wikipedia to find the answer."
            ),
            "expected_keywords": ["3", "three"],
        },
        {
            "task_id": "q4b",
            "question": (
                "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
            ),
            "expected_keywords": ["FunkMonk"],
        },
        {
            "task_id": "q5",
            "question": (
                "Who is the CEO of OpenAI?"
            ),
            "expected_keywords": ["sam altman"],
        },
        {
            "task_id": "q6",
            "question": (
                "When was the Eiffel Tower built?"
            ),
            "expected_keywords": ["1889"],
        },
    ]


# The name matches pytest's default ``test_*`` pattern, but this function is a
# data provider, not a test.  Opting out of collection keeps pytest from
# running it and complaining about its non-None return value.
test_questions.__test__ = False


def _matches_any(answer, keywords) -> bool:
    """Return True if *answer* contains any of *keywords*, ignoring case."""
    # Agents may return None or a non-string object; normalise to text so the
    # comparison cannot fail on a missing ``.lower()``.
    haystack = ("" if answer is None else str(answer)).lower()
    return any(keyword.lower() in haystack for keyword in keywords)


def evaluate_agent(agent, questions) -> str:
    """Run *agent* on every question and report how many answers match.

    Args:
        agent: Callable invoked as ``agent(question_text)``; its return value
            is treated as the answer.
        questions: Iterable-with-length of dicts carrying ``"question"`` and
            ``"expected_keywords"`` keys (the shape produced by
            ``test_questions``).

    Returns:
        The summary line ``"✅ Score: <correct> / <total> correct"``.

    An answer counts as correct when it contains at least one expected
    keyword as a case-insensitive substring.  Progress is printed per
    question.  An exception raised while answering or scoring a question is
    printed and that question is counted as incorrect; the run continues.
    A ``None`` or non-string answer is scored as text rather than reported
    as an error.
    """
    print("\n\n========= Running GAIA Evaluation =========\n")

    correct = 0
    total = len(questions)

    for q in questions:
        print(f"🟨 Q: {q['question']}")
        # Broad catch on purpose: this is the harness boundary, and one
        # failing question must not abort the whole evaluation.
        try:
            answer = agent(q["question"])
            print(f"🟩 A: {answer}\n")

            matched = _matches_any(answer, q["expected_keywords"])
            expected = ", ".join(q["expected_keywords"])
            result = "✅ Correct" if matched else "❌ Incorrect"
            print(f"{result} — Expected one of: [{expected}]\n")

            if matched:
                correct += 1
        except Exception as e:
            print(f"🟥 Error: {e}\n")

    score_report = f"✅ Score: {correct} / {total} correct"
    print(score_report + "\n")
    return score_report