| | import requests |
| | from datasets import load_dataset |
| | from transformers import pipeline |
| |
|
| | |
| | |
| | |
# Endpoint of the HF Agents-course Unit 4 scoring service (serves the
# GAIA question subset used for grading).
SCORING_API = "https://agents-course-unit4-scoring.hf.space"

# Hugging Face model id used to generate candidate answers.
MODEL_NAME = "google/flan-t5-base"
| |
|
| | |
| | |
| | |
| | print("Loading model...") |
| | qa = pipeline("text2text-generation", model=MODEL_NAME, max_new_tokens=64) |
| |
|
| | |
| | |
| | |
| | print("Fetching GAIA questions...") |
| | questions = requests.get(f"{SCORING_API}/questions").json() |
| |
|
| | task_ids = [q["task_id"] for q in questions] |
| |
|
| | |
| | |
| | |
| | print("Loading GAIA validation set...") |
| | dataset = load_dataset( |
| | "gaia-benchmark/GAIA", |
| | "2023_level1", |
| | split="validation" |
| | ) |
| |
|
| | |
| | ground_truth = { |
| | item["task_id"]: item["Final answer"] |
| | for item in dataset |
| | if item["task_id"] in task_ids |
| | } |
| |
|
| | |
| | |
| | |
# Score the model: exact match after lowercasing/stripping both sides.
correct = 0

for q in questions:
    task_id = q["task_id"]
    question = q["question"]
    true_answer = ground_truth.get(task_id, "").strip().lower()

    # Normalise the model output the same way as the reference answer.
    model_output = qa(question)[0]["generated_text"].strip().lower()

    # Guard against a task with no ground truth: an empty reference must
    # never count as a match (e.g. against an empty model output).
    match = bool(true_answer) and model_output == true_answer
    correct += int(match)

    print("\n" + "=" * 80)
    print(f"QUESTION:\n{question}")
    print(f"\nEXPECTED:\n{true_answer}")
    print(f"\nMODEL:\n{model_output}")
    print(f"\nMATCH: {'✅' if match else '❌'}")

print("\n" + "=" * 80)
# Report against the actual number of questions served, not a hard-coded 20.
print(f"FINAL SCORE: {correct}/{len(questions)}")
| |
|