pourNathann

Runtime error

App Files Files Community

MasterOfHugs committed on Sep 26

Commit

b6c4ba9

verified ·

1 Parent(s): 0a1558c

Update app.py

Browse files

Files changed (1) hide show

app.py +194 -192

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import re
 import json
@@ -5,209 +6,235 @@ import logging
 import requests
 import pandas as pd
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# --- Logging ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-MODEL_NAME = "bigscience/bloomz-1b1"
-# --- Load model & tokenizer (Causal LM for BLOOM) ---
 logger.info(f"Loading tokenizer and model: {MODEL_NAME} ...")
 try:
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    # Ensure pad token exists
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
-    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     logger.info("Model and tokenizer loaded successfully.")
 except Exception as e:
     logger.exception(f"Error loading model/tokenizer for '{MODEL_NAME}': {e}")
     raise
-# --- Dummy Tool Example (kept for compatibility) ---
-class AddTwoNumbers:
-    """Tool that adds two integers"""
     @staticmethod
-    def run(a: int, b: int) -> int:
-        return a + b
-tools_description = "Available tool: AddTwoNumbers.run(a, b)"
 # --- Reasoning Agent ---
 class ReasoningAgent:
     def __init__(self):
         self.tools_description = tools_description
-        logger.info("ReasoningAgent initialized.")
-        # Few-shot + strict instruction to try to get JSON-only output
         self.few_shot = (
-            "Example:\n"
-            "Question: What is 2 + 3?\n"
-            "Answer in JSON:\n"
-            '{\n'
-            '  "thought": "I will add 2 and 3 step by step",\n'
-            '  "action": "AddTwoNumbers.run(2, 3)",\n'
-            '  "observation": "5",\n'
-            '  "answer": "5"\n'
-            '}\n\n'
-            "Example:\n"
-            "Question: Who discovered X (unknown)?\n"
-            "Answer in JSON:\n"
-            '{\n'
-            '  "thought": "I do not know this fact",\n'
-            '  "action": "None",\n'
-            '  "observation": "",\n'
-            '  "answer": "I do not know."\n'
-            '}\n\n'
         )
-        self.instruction = (
-            "You are an AI reasoning agent. "
-            "Available tool: AddTwoNumbers.run(a, b). "
-            "Answer the question and respond ONLY with a SINGLE valid JSON object (no explanatory text, no code). "
-            'Format exactly as: {"thought":..., "action":..., "observation":..., "answer":...}. '
-            'If you are unsure, set "answer": "I do not know."'
         )
-    def generate(self, prompt: str, max_new_tokens: int = 220) -> str:
-        """Generate text with the causal model, returning only generated suffix (not prompt)."""
-        inputs = tokenizer(prompt, return_tensors="pt")
-        input_len = inputs["input_ids"].shape[1]
-        try:
-            out = model.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=False,
-                # greedy generation by default; adjust if you want beams/sampling
-                pad_token_id=tokenizer.pad_token_id
-            )
-            # `out[0]` contains prompt + generated tokens for causal LM
-            full_decoded = tokenizer.decode(out[0], skip_special_tokens=True)
-            # Try to remove the prompt prefix from the decoded string to get only new text
-            prompt_decoded = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
-            if full_decoded.startswith(prompt_decoded):
-                generated = full_decoded[len(prompt_decoded):].strip()
-            else:
-                # fallback heuristic: try slicing tokens
-                generated_tokens = out[0][input_len:]
-                if generated_tokens.nelement() == 0:
-                    generated = ""
-                else:
-                    generated = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
-            return generated
-        except Exception as e:
-            logger.exception("Generation error: %s", e)
-            raise
-    def extract_first_json(self, text: str):
-        """Extract the first JSON object found in text. Returns Python object or None."""
-        if text is None:
-            return None
-        # Regex to find first balanced-ish JSON object (handles simple nested objects)
         m = re.search(r"\{(?:[^{}]|\{[^{}]*\})*\}", text, re.DOTALL)
         if not m:
             return None
         json_text = m.group(0)
         try:
-            return json.loads(json_text)
         except json.JSONDecodeError:
-            # Try to fix common issues: replace single quotes -> double quotes, None->null, trailing commas
             fixed = json_text.replace("'", '"')
-            fixed = re.sub(r"\bNone\b", "null", fixed)
-            fixed = re.sub(r",\s*}", "}", fixed)
-            fixed = re.sub(r",\s*\]", "]", fixed)
             try:
-                return json.loads(fixed)
             except Exception:
-                logger.debug("Failed to decode JSON even after fixes. Raw: %s", json_text)
                 return None
     def __call__(self, question: str) -> str:
-        logger.info("\n=== Processing Question ===\n%s\n", question)
-        prompt = (
-            self.few_shot
-            + "\n\n"
-            + self.instruction
-            + f"\n\nQuestion: {question}\nAnswer in JSON:"
-        )
         try:
-            generated = self.generate(prompt, max_new_tokens=300)
-            # If generated is empty, try decoding entire output (fallback)
-            if not generated:
-                logger.info("Generated empty suffix, trying full decode fallback.")
-                generated = self.generate(prompt, max_new_tokens=300)
-            logger.info("=== Generated (raw) ===\n%s\n", generated[:4000])
         except Exception as e:
-            logger.exception("Generation failed: %s", e)
             return f"AGENT ERROR: Generation failed: {e}"
-        # Try to get first JSON object
-        parsed = self.extract_first_json(generated)
-        if parsed is None:
-            # If no JSON found, try to interpret simple plain answers (single token/number/word)
-            answer_guess = generated.strip().splitlines()[0] if generated.strip() else "I do not know."
-            parsed = {"thought": "", "action": "None", "observation": "", "answer": answer_guess}
-        # Normalize fields
-        thought = parsed.get("thought", "").strip() if isinstance(parsed.get("thought", ""), str) else ""
-        action = parsed.get("action", parsed.get("tool", "None")) or ""
-        observation = parsed.get("observation", "") or ""
-        answer = parsed.get("answer", "") or ""
-        # If action is a string invoking AddTwoNumbers, execute it
-        if isinstance(action, str) and action.strip().startswith("AddTwoNumbers"):
             try:
-                args_text = action[action.find("(")+1:action.find(")")]
-                args = [a.strip() for a in args_text.split(",") if a.strip() != ""]
-                if len(args) == 2:
-                    a_val = int(args[0])
-                    b_val = int(args[1])
-                    obs = AddTwoNumbers.run(a_val, b_val)
-                    observation = str(obs)
-                    # If answer is placeholder or empty, set to observation
-                    if not answer or str(answer).strip().lower() in ["", "none", "null", "i do not know."]:
-                        answer = str(obs)
-                    logger.info("✅ Executed tool: AddTwoNumbers.run(%s, %s) -> %s", a_val, b_val, obs)
-                else:
-                    observation = "TOOL ERROR: wrong number of args"
-                    logger.warning("Tool call had wrong number of arguments: %s", action)
             except Exception as e:
-                observation = f"TOOL ERROR: {e}"
-                logger.exception("Tool execution error: %s", e)
-        # Sanity checks and fallbacks
-        if isinstance(answer, str):
-            answer_str = answer.strip()
         else:
-            answer_str = str(answer)
-        # Heuristics to avoid returning the few-shot examples as the final answer
-        if self.few_shot.strip()[:30] in answer_str:
-            answer_str = "I do not know."
-        if not answer_str or answer_str.lower() in ["none", "null", "i do not know.", "i do not know"]:
-            answer_str = "I do not know."
         # Log internal state
         logger.info("💭 Thought: %s", thought)
         logger.info("🔧 Action: %s", action)
-        logger.info("👀 Observation: %s", observation)
-        logger.info("📝 Answer: %s", answer_str)
         logger.info("-" * 60)
-        return answer_str
-# --- Run & Submit function (keeps the same interface) ---
 def run_and_submit_all(profile: gr.OAuthProfile | None):
-    """
-    Fetch questions, run the agent on them, submit answers, and return status + results table.
-    """
     if profile:
         username = profile.username
         logger.info("User logged in: %s", username)
@@ -218,36 +245,27 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     questions_url = f"{DEFAULT_API_URL}/questions"
     submit_url = f"{DEFAULT_API_URL}/submit"
-    # 1. Fetch questions
-    logger.info("Fetching questions from: %s", questions_url)
     try:
         response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
         questions_data = response.json()
-        if not isinstance(questions_data, list) or len(questions_data) == 0:
-            logger.warning("Fetched questions list is empty or invalid format.")
             return "Fetched questions list is empty or invalid format.", None
     except Exception as e:
-        logger.exception("Error fetching questions: %s", e)
         return f"Error fetching questions: {e}", None
-    # 2. Instantiate agent
-    try:
-        agent = ReasoningAgent()
-    except Exception as e:
-        logger.exception("Error instantiating agent: %s", e)
-        return f"Error initializing agent: {e}", None
-    # 3. Run agent on questions
     results_log = []
     answers_payload = []
-    logger.info("Running agent on %d questions...", len(questions_data))
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
-            logger.warning("Skipping item with missing task_id or question: %s", item)
             continue
         try:
             submitted_answer = agent(question_text)
@@ -258,7 +276,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 "Submitted Answer": submitted_answer
             })
         except Exception as e:
-            logger.exception("Error running agent on task %s: %s", task_id, e)
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text,
@@ -269,17 +287,17 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         logger.warning("Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
-    # 4. Submit
     submission_data = {
         "username": username.strip(),
         "agent_code": f"https://huggingface.co/spaces/{os.getenv('SPACE_ID')}/tree/main",
         "answers": answers_payload
     }
-    logger.info("Submitting %d answers to: %s", len(answers_payload), submit_url)
     try:
-        response = requests.post(submit_url, json=submission_data, timeout=60)
-        response.raise_for_status()
-        result_data = response.json()
         final_status = (
             f"Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
@@ -287,33 +305,32 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
             f"Message: {result_data.get('message', 'No message received.')}"
         )
-        logger.info("Submission successful.")
         results_df = pd.DataFrame(results_log)
         return final_status, results_df
     except requests.exceptions.HTTPError as e:
-        logger.exception("Submission HTTP error: %s", e)
         try:
-            err_json = e.response.json()
-            detail = err_json.get("detail", e.response.text)
         except Exception:
             detail = str(e)
-        status_message = f"Submission Failed: {detail}"
         results_df = pd.DataFrame(results_log)
-        return status_message, results_df
     except Exception as e:
-        logger.exception("Submission error: %s", e)
         results_df = pd.DataFrame(results_log)
         return f"Submission failed: {e}", results_df
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Reasoning Agent Runner (BLOOMZ Causal LM)")
     gr.Markdown(
         """
         Instructions:
-        1. Login with Hugging Face (use the Login button).
         2. Click 'Run Evaluation & Submit All Answers'.
-        3. The agent will attempt step-by-step reasoning and submit answers.
         """
     )
     gr.LoginButton()
@@ -327,20 +344,5 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    print("\n" + "-"*30 + " App Starting " + "-"*30)
-    # Print environment hints
-    space_host_startup = os.getenv("SPACE_HOST")
-    space_id_startup = os.getenv("SPACE_ID")
-    if space_host_startup:
-        print(f"✅ SPACE_HOST found: {space_host_startup}")
-        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
-    else:
-        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")
-    if space_id_startup:
-        print(f"✅ SPACE_ID found: {space_id_startup}")
-        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
-    else:
-        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
-    print("-"*(60 + len(" App Starting ")) + "\n")
     demo.launch(debug=True, share=False)

+# app.py
 import os
 import re
 import json
 import requests
 import pandas as pd
 import gradio as gr
+import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
+# --- Logging setup ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+# Change MODEL_NAME if you want a smaller / different causal model
+MODEL_NAME = os.getenv("MODEL_NAME", "bigscience/bloomz-1b1")
+# --- Load tokenizer & model (causal LM) ---
 logger.info(f"Loading tokenizer and model: {MODEL_NAME} ...")
 try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+    # ensure pad_token_id set
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
+    # move to device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    model.eval()
     logger.info("Model and tokenizer loaded successfully.")
 except Exception as e:
     logger.exception(f"Error loading model/tokenizer for '{MODEL_NAME}': {e}")
     raise
+# --- Simple Wikipedia search tool (synchronous, HTTP requests) ---
class WikipediaTool:
    """Minimal synchronous client for the English Wikipedia MediaWiki API.

    Each public helper performs one HTTP GET against ``API_BASE`` and raises
    whatever ``requests`` raises on transport errors or a non-2xx status
    (``raise_for_status`` is always called).
    """
    API_BASE = "https://en.wikipedia.org/w/api.php"
    # Wikimedia asks API clients to send an identifying User-Agent; requests
    # carrying only a generic library default may be throttled or rejected.
    HEADERS = {"User-Agent": "ReasoningAgentRunner/1.0 (Hugging Face Space; python-requests)"}
    # Seconds to wait for the API before giving up.
    TIMEOUT = 10
    # Same tag-stripping pattern as before, compiled once.
    _TAG_RE = re.compile("<.*?>")

    @staticmethod
    def _query(params: dict) -> dict:
        """Send one GET request to the API and return the decoded JSON body."""
        response = requests.get(
            WikipediaTool.API_BASE,
            params=params,
            headers=WikipediaTool.HEADERS,
            timeout=WikipediaTool.TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def search(query: str, limit: int = 3):
        """Return a list of ``{"title", "snippet"}`` dicts for *query*.

        Args:
            query: Free-text search string (sent as ``srsearch``).
            limit: Maximum number of hits requested (sent as ``srlimit``).

        Returns:
            A possibly empty list; snippets have HTML tags removed and HTML
            entities (``&quot;`` etc.) decoded so they read as plain text.
        """
        import html  # local import: only this helper needs it

        data = WikipediaTool._query({
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srlimit": limit,
            "format": "json",
        })
        hits = data.get("query", {}).get("search", [])
        return [
            {
                "title": hit.get("title"),
                "snippet": html.unescape(WikipediaTool._TAG_RE.sub("", hit.get("snippet", ""))),
            }
            for hit in hits
        ]

    @staticmethod
    def get_extract(title: str, chars: int = 800):
        """Return ``{"title", "extract"}`` for the page named *title*.

        Args:
            title: Page title; redirects are followed (``redirects=1``).
            chars: Requested extract length in characters (``exchars``).

        Returns:
            The first page in the response, or ``{"title": title, "extract": ""}``
            when the response contains no pages. A page without an extract
            yields an empty ``extract`` string.
        """
        data = WikipediaTool._query({
            "action": "query",
            "prop": "extracts",
            "explaintext": True,
            "exchars": chars,
            "titles": title,
            "format": "json",
            "redirects": 1,
        })
        pages = data.get("query", {}).get("pages", {})
        # Only one title is requested, so the first entry is the page we want.
        page = next(iter(pages.values()), None)
        if page is None:
            return {"title": title, "extract": ""}
        return {"title": page.get("title"), "extract": page.get("extract", "")}
+# --- Tools description presented to the model ---
+tools_description = (
+    "Available tool: Wikipedia.search(query) -> returns a short list of titles+snippets.\n"
+    "               Wikipedia.get_extract(title) -> returns the page extract (plain text).\n"
+    "If you want the agent to use the web, call these tools by writing action like:\n"
+    "  Search: Wikipedia.search(\"query string\")\n"
+    "  Extract: Wikipedia.get_extract(\"Exact Page Title\")\n"
+    "If unsure or cannot answer from tools, set answer to \"I do not know.\""
+)
 # --- Reasoning Agent ---
 class ReasoningAgent:
     """Single-step JSON agent driven by the module-level causal LM.

     For each question the agent builds a prompt, generates one completion,
     parses the first JSON object out of that completion, optionally runs the
     single Wikipedia tool call named in its ``action`` field, and returns the
     ``answer`` field as a string.
     """
     # Answers that count as "no real answer" after a tool call succeeded.
     _PLACEHOLDER_ANSWERS = ("", "I do not know.", "None")
     # Matches Wikipedia.search("...") and Wikipedia.get_extract("...").
     _ACTION_RE = re.compile(r'Wikipedia\.(search|get_extract)\(\s*["\'](.+?)["\']\s*\)\s*$')
     # Fallback matcher: first brace-delimited span with at most one nested level.
     _JSON_SPAN_RE = re.compile(r"\{(?:[^{}]|\{[^{}]*\})*\}", re.DOTALL)

     def __init__(self):
         self.tools_description = tools_description
         # small few-shot just to show JSON format (kept minimal)
         self.few_shot = (
             "Format example (ONLY RETURN a single JSON object):\n"
             '{"thought":"...","action":"...","observation":"...","answer":"..."}\n'
             "Action should be a single tool call or 'None'.\n"
         )
         logger.info("ReasoningAgent initialized.")

     def build_prompt(self, question: str) -> str:
         """Return the full prompt: format example, instructions, then the question."""
         # Keep prompt compact and explicit: produce ONLY one JSON object.
         instruction = (
             "You are an AI reasoning agent. Use the available tools if needed.\n"
             + self.tools_description + "\n"
             "Answer ONLY with a SINGLE valid JSON object (no extra text, no code). "
             "Use exactly the keys: thought, action, observation, answer.\n"
             "If you are going to call a tool, set action to the tool call as a single string; "
             "if not using tools set action to \"None\". "
             "If unsure, set answer to \"I do not know.\""
         )
         return f"{self.few_shot}\n{instruction}\n\nQuestion: {question}\nAnswer in JSON:"

     def parse_action(self, action_str: str):
         """
         Recognize actions of the form:
           Wikipedia.search("query")
           Wikipedia.get_extract("Title")
         Returns a tuple (tool_name, arg) where tool_name is "search" or
         "extract", or (None, None) for anything else (including non-strings).
         """
         if not isinstance(action_str, str):
             return None, None
         m = self._ACTION_RE.match(action_str.strip())
         if not m:
             return None, None
         tool_name = "search" if m.group(1) == "search" else "extract"
         return tool_name, m.group(2)

     def extract_json(self, text: str):
         """Return the first JSON object in *text* as a dict, or None.

         The object starting at the first ``{`` is decoded with a real JSON
         parser, so arbitrarily nested objects and braces inside strings are
         handled. If that fails, the first brace-delimited span is retried
         after replacing single quotes with double quotes.
         """
         if not isinstance(text, str):
             return None
         start = text.find("{")
         if start == -1:
             return None
         try:
             parsed, _end = json.JSONDecoder().raw_decode(text, start)
             return parsed
         except json.JSONDecodeError:
             pass
         m = self._JSON_SPAN_RE.search(text)
         if not m:
             return None
         json_text = m.group(0)
         try:
             return json.loads(json_text)
         except json.JSONDecodeError:
             # try to fix common issues: single quotes -> double quotes
             fixed = json_text.replace("'", '"')
             try:
                 return json.loads(fixed)
             except json.JSONDecodeError:
                 return None

     def _execute_tool(self, tool_name, tool_arg, answer):
         """Run one Wikipedia tool call and return (observation, answer).

         On success the model's answer is kept unless it is a placeholder, in
         which case a short status message replaces it. On any tool error the
         observation describes the error and the answer is "I do not know.".
         """
         label = "search" if tool_name == "search" else "extract"
         try:
             if tool_name == "search":
                 results = WikipediaTool.search(tool_arg, limit=3)
                 observation = json.dumps(results, ensure_ascii=False)
                 fallback = f"Found {len(results)} wiki search results for '{tool_arg}'."
                 logger.info("✅ Executed tool: Wikipedia.search('%s') -> %d results", tool_arg, len(results))
             else:
                 res = WikipediaTool.get_extract(tool_arg, chars=1500)
                 observation = json.dumps(res, ensure_ascii=False)
                 fallback = f"Extract fetched for '{res.get('title')}'."
                 logger.info("✅ Executed tool: Wikipedia.get_extract('%s')", tool_arg)
         except Exception as e:
             logger.exception("Wikipedia %s error", label)
             return f"Wikipedia {label} error: {e}", "I do not know."
         if not answer or str(answer).strip() in self._PLACEHOLDER_ANSWERS:
             answer = fallback
         return observation, answer

     def __call__(self, question: str) -> str:
         """Answer *question* and return only the answer string for submission.

         Returns "I do not know." when no JSON object can be parsed or the
         answer is empty, and an "AGENT ERROR: ..." string when generation fails.
         """
         logger.info("\n=== Processing Question ===\n%s\n", question)
         prompt = self.build_prompt(question)
         # Tokenize & generate
         try:
             inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
             out = model.generate(
                 **inputs,
                 max_new_tokens=220,
                 do_sample=False,
                 num_beams=3,
                 early_stopping=True,
                 pad_token_id=tokenizer.pad_token_id
             )
             # A causal LM returns prompt + continuation. Decode only the new
             # tokens, otherwise the format example inside the prompt is the
             # first JSON object found and every answer becomes "...".
             prompt_len = inputs["input_ids"].shape[1]
             generated = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
             logger.info("=== Generated (raw) ===\n%s", generated[:2000])
         except Exception as e:
             logger.exception("Generation error: %s", e)
             return f"AGENT ERROR: Generation failed: {e}"
         # Extract JSON
         parsed = self.extract_json(generated)
         if not parsed:
             # fallback: return "I do not know."
             logger.warning("No valid JSON parsed from model output. Returning I do not know.")
             return "I do not know."
         # Ensure keys exist
         thought = parsed.get("thought", "")
         action = parsed.get("action", "None")
         observation = parsed.get("observation", "")
         answer = parsed.get("answer", "")
         # If model asked to call Wikipedia tools, do it
         tool_name, tool_arg = self.parse_action(action if action is not None else "")
         if tool_name is not None:
             observation, answer = self._execute_tool(tool_name, tool_arg, answer)
         else:
             # no tool or unrecognized action
             logger.debug("No tool called or action unrecognized: %s", action)
         # Final sanitization: always hand back a stripped string so numeric
         # answers such as 0 are submitted as text instead of being dropped.
         answer_text = "" if answer is None else str(answer).strip()
         if answer_text in ("", "None", "null"):
             answer_text = "I do not know."
         # Log internal state
         observation_text = str(observation)
         logger.info("💭 Thought: %s", thought)
         logger.info("🔧 Action: %s", action)
         logger.info("👀 Observation: %s", observation if len(observation_text) < 400 else observation_text[:400] + "...")
         logger.info("📝 Answer: %s", answer_text)
         logger.info("-" * 60)
         return answer_text
+# --- Run & Submit ---
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     if profile:
         username = profile.username
         logger.info("User logged in: %s", username)
     questions_url = f"{DEFAULT_API_URL}/questions"
     submit_url = f"{DEFAULT_API_URL}/submit"
     try:
         response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
         questions_data = response.json()
+        if not isinstance(questions_data, list):
+            logger.error("Unexpected questions_data format: %s", type(questions_data))
             return "Fetched questions list is empty or invalid format.", None
     except Exception as e:
+        logger.exception("Error fetching questions")
         return f"Error fetching questions: {e}", None
+    agent = ReasoningAgent()
     results_log = []
     answers_payload = []
+    logger.info("Running agent on %d questions...", len(questions_data))
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
+            logger.warning("Skipping invalid item: %s", item)
             continue
         try:
             submitted_answer = agent(question_text)
                 "Submitted Answer": submitted_answer
             })
         except Exception as e:
+            logger.exception("Agent run error on task %s: %s", task_id, e)
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text,
         logger.warning("Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
     submission_data = {
         "username": username.strip(),
         "agent_code": f"https://huggingface.co/spaces/{os.getenv('SPACE_ID')}/tree/main",
         "answers": answers_payload
     }
+    logger.info("Submitting %d answers for user '%s' to %s ...", len(answers_payload), username, submit_url)
     try:
+        resp = requests.post(submit_url, json=submission_data, timeout=60)
+        resp.raise_for_status()
+        result_data = resp.json()
         final_status = (
             f"Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
             f"Message: {result_data.get('message', 'No message received.')}"
         )
         results_df = pd.DataFrame(results_log)
+        logger.info("Submission succeeded.")
         return final_status, results_df
     except requests.exceptions.HTTPError as e:
+        logger.exception("Submission HTTP error")
         try:
+            detail = e.response.json()
         except Exception:
             detail = str(e)
         results_df = pd.DataFrame(results_log)
+        return f"Submission Failed: {detail}", results_df
     except Exception as e:
+        logger.exception("Submission error")
         results_df = pd.DataFrame(results_log)
         return f"Submission failed: {e}", results_df
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
+    gr.Markdown("# Reasoning Agent Runner")
     gr.Markdown(
         """
         Instructions:
+        1. Login with Hugging Face.
         2. Click 'Run Evaluation & Submit All Answers'.
+        3. The agent can call Wikipedia.search(...) and Wikipedia.get_extract(...).
         """
     )
     gr.LoginButton()
     )
 if __name__ == "__main__":
+    logger.info("Starting Gradio app...")
     demo.launch(debug=True, share=False)