mattibuzzo13 committed on
Commit
464dcdc
·
verified ·
1 Parent(s): 05b3496

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +246 -78
app.py CHANGED
@@ -26,18 +26,47 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
26
 
27
  @tool
28
  def web_search(query: str) -> str:
29
- """Search the web using DuckDuckGo. Use for current events, facts, and general knowledge."""
30
  try:
31
  return DuckDuckGoSearchRun().run(query)
32
  except Exception as e:
33
  return f"Search error: {e}"
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  @tool
37
  def wikipedia_search(query: str) -> str:
38
- """Search Wikipedia for encyclopedic knowledge, historical facts, biographies, science."""
39
  try:
40
- wiki = WikipediaAPIWrapper(top_k_results=2, doc_content_chars_max=3000)
41
  return wiki.run(query)
42
  except Exception as e:
43
  return f"Wikipedia error: {e}"
@@ -46,9 +75,9 @@ def wikipedia_search(query: str) -> str:
46
  @tool
47
  def python_repl(code: str) -> str:
48
  """
49
- Execute Python code for math calculations, data processing, logic.
50
- Always use print() to show the result.
51
- Example: print(2 + 2)
52
  """
53
  import io, sys
54
  old_stdout = sys.stdout
@@ -67,8 +96,8 @@ def python_repl(code: str) -> str:
67
  @tool
68
  def calculator(expression: str) -> str:
69
  """
70
- Evaluate a simple math expression.
71
- Examples: '2 + 2', '100 * 1.07 ** 5', 'math.sqrt(144)'
72
  """
73
  try:
74
  return str(eval(expression, {"math": math, "__builtins__": {}}))
@@ -79,8 +108,9 @@ def calculator(expression: str) -> str:
79
  @tool
80
  def get_task_file(task_id: str) -> str:
81
  """
82
- Fetch the file attached to a GAIA task by its task_id.
83
- Use this when the question mentions an attached file or document.
 
84
  """
85
  try:
86
  import requests as req
@@ -100,23 +130,44 @@ def get_task_file(task_id: str) -> str:
100
  class AgentState(TypedDict):
101
  messages: Annotated[list[AnyMessage], add_messages]
102
 
103
- SYSTEM_PROMPT = """You are a precise expert AI solving GAIA benchmark questions.
104
-
105
- ## Answer Format (CRITICAL)
106
- - Give ONLY the bare answer: a number, word, name, date, or short phrase.
107
- - NO explanations, NO punctuation at the end, NO "The answer is...".
108
- - Correct examples: `42`, `Marie Curie`, `Paris`, `1969`, `blue`
109
- - For lists: `item1, item2, item3`
110
-
111
- ## Strategy
112
- 1. Read carefully — identify exactly what is asked.
113
- 2. Use tools to find and verify the answer.
114
- 3. Double-check calculations with calculator or python_repl.
115
- 4. If the question mentions a file or attachment, use get_task_file.
116
-
117
- ## Final Answer
118
- Always end with:
119
- FINAL ANSWER: <your answer here>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  """
121
 
122
  def _tool_to_openai_schema(t) -> dict:
@@ -146,11 +197,16 @@ class BasicAgent:
146
 
147
  # Mappa nome → funzione tool per esecuzione
148
  self.tools_by_name = {t.name: t for t in self.tools_list}
149
-
150
- # InferenceClient diretto — usa la Serverless Inference API HF
151
- self.client = InferenceClient(
152
- api_key=os.getenv("HF_TOKEN"),
153
- )
 
 
 
 
 
154
 
155
  # Schema OpenAI dei tool per passarli al client
156
  self.tools_schema = [_tool_to_openai_schema(t) for t in self.tools_list]
@@ -204,12 +260,12 @@ class BasicAgent:
204
  hf_messages = self._messages_to_hf_format([sys_msg] + state["messages"])
205
 
206
  response = self.client.chat_completion(
207
- model="Qwen/Qwen2.5-7B-Instruct",
208
  messages=hf_messages,
209
  tools=self.tools_schema,
210
  tool_choice="auto",
211
- max_tokens=100,
212
- temperature=0,
213
  )
214
 
215
  choice = response.choices[0].message
@@ -230,25 +286,134 @@ class BasicAgent:
230
  tool_calls=tool_calls,
231
  )
232
  return {"messages": [ai_message]}
233
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  def __call__(self, question: str) -> str:
235
- print(f"Agent received question (first 50 chars): {question[:50]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  try:
237
- result = self.graph.invoke({
238
- "messages": [HumanMessage(content=question)]
239
- })
240
- last_message = result["messages"][-1].content
241
- print(f"Agent raw output: {last_message[:200]}...")
242
-
243
- # Estrai FINAL ANSWER se presente, altrimenti ultima riga
244
- match = re.search(r"FINAL ANSWER:\s*(.+?)(?:\n|$)", last_message, re.IGNORECASE)
245
- answer = match.group(1).strip() if match else last_message.strip().split("\n")[-1]
246
-
247
- print(f"Agent returning answer: {answer}")
248
- return answer
249
  except Exception as e:
250
- print(f"Agent error: {e}")
251
- return f"AGENT ERROR: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  def run_and_submit_all( profile: gr.OAuthProfile | None):
254
  """
@@ -371,42 +536,44 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
371
  return status_message, results_df
372
 
373
 
374
- # --- Build Gradio Interface using Blocks ---
375
- with gr.Blocks() as demo:
376
- gr.Markdown("# Basic Agent Evaluation Runner")
377
- gr.Markdown(
378
- """
379
- **Instructions:**
380
 
381
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
382
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
383
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
384
 
385
- ---
386
- **Disclaimers:**
387
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
388
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
389
- """
390
- )
391
 
392
- gr.LoginButton()
393
 
394
- run_button = gr.Button("Run Evaluation & Submit All Answers")
395
 
396
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
397
- # Removed max_rows=10 from DataFrame constructor
398
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
399
 
400
- run_button.click(
401
- fn=run_and_submit_all,
402
- outputs=[status_output, results_table]
403
- )
 
 
404
 
405
  if __name__ == "__main__":
406
  print("\n" + "-"*30 + " App Starting " + "-"*30)
 
407
  # Check for SPACE_HOST and SPACE_ID at startup for information
408
  space_host_startup = os.getenv("SPACE_HOST")
409
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
410
 
411
  if space_host_startup:
412
  print(f"✅ SPACE_HOST found: {space_host_startup}")
@@ -414,7 +581,7 @@ if __name__ == "__main__":
414
  else:
415
  print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
416
 
417
- if space_id_startup: # Print repo URLs if SPACE_ID is found
418
  print(f"✅ SPACE_ID found: {space_id_startup}")
419
  print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
420
  print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
@@ -424,4 +591,5 @@ if __name__ == "__main__":
424
  print("-"*(60 + len(" App Starting ")) + "\n")
425
 
426
  print("Launching Gradio Interface for Basic Agent Evaluation...")
427
- demo.launch(debug=True, share=False)
 
 
26
 
27
  @tool
28
  def web_search(query: str) -> str:
29
+ """Search the web using DuckDuckGo for current facts, news, specific data, recent information, and verification. Returns top search results."""
30
  try:
31
  return DuckDuckGoSearchRun().run(query)
32
  except Exception as e:
33
  return f"Search error: {e}"
34
 
35
 
36
def _wikipedia_api_query(title: str) -> str:
    """Return the plain-text extract of an English Wikipedia page.

    Args:
        title: Title of the page to look up.

    Returns:
        The page's plain-text extract, or an empty string when the request
        fails (HTTP error status, network failure, timeout), the response
        body is not valid JSON, or the response carries no extract.
    """
    # Let requests build and percent-encode the query string instead of
    # concatenating it by hand.
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "explaintext": 1,
        "titles": title,
    }
    headers = {"User-Agent": "Mozilla/5.0 (compatible; AgentBot/1.0; +https://example.com/bot)"}

    try:
        r = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params=params,
            headers=headers,
            timeout=15,
        )
        r.raise_for_status()
        data = r.json()
    except requests.exceptions.HTTPError as ee:
        print(f"Warning: Wikipedia API HTTP error {ee}")
        return ""
    except (requests.exceptions.RequestException, ValueError) as ee:
        # Connection errors, timeouts and undecodable bodies are treated like
        # an HTTP error: warn and hand back "no text" instead of raising.
        print(f"Warning: Wikipedia API request failed: {ee}")
        return ""

    pages = data.get("query", {}).get("pages", {})
    if not pages:
        return ""
    # Only one title is requested, so the first entry is the page of interest.
    text = next(iter(pages.values())).get("extract", "")
    return text or ""
57
+
58
+
59
@tool
def wikipedia_api_query(title: str) -> str:
    """Get plain text extract from English Wikipedia for a page title."""
    # Tool boundary: mirror wikipedia_search and web_search, which report
    # failures as text rather than letting an exception escape the tool.
    try:
        return _wikipedia_api_query(title)
    except Exception as e:
        return f"Wikipedia error: {e}"
63
+
64
+
65
  @tool
66
  def wikipedia_search(query: str) -> str:
67
+ """Search Wikipedia for encyclopedic knowledge: historical facts, biographies, dates, definitions, figures, scientific information. Provides structured text summaries."""
68
  try:
69
+ wiki = WikipediaAPIWrapper(top_k_results=3, doc_content_chars_max=3000)
70
  return wiki.run(query)
71
  except Exception as e:
72
  return f"Wikipedia error: {e}"
 
75
  @tool
76
  def python_repl(code: str) -> str:
77
  """
78
+ Execute Python code for mathematical calculations, data processing, logic operations, and transformations.
79
+ Always use print() to output results.
80
+ Examples: print(2**10), print([1,2,3].count(2)), data=[1,2,3]; print(sum(data)/len(data))
81
  """
82
  import io, sys
83
  old_stdout = sys.stdout
 
96
  @tool
97
  def calculator(expression: str) -> str:
98
  """
99
+ Evaluate a mathematical expression quickly. Use for simple arithmetic and compound calculations.
100
+ Examples: '2 + 2', '100 * 1.07 ** 5', 'math.sqrt(144)', '(50 + 30) / 2'
101
  """
102
  try:
103
  return str(eval(expression, {"math": math, "__builtins__": {}}))
 
108
  @tool
109
  def get_task_file(task_id: str) -> str:
110
  """
111
+ Fetch the file or document attached to a GAIA task by its task_id.
112
+ Use this when the question mentions an attached file, document, PDF, or any attachment.
113
+ Returns text content for text/JSON files, or indicates binary file type.
114
  """
115
  try:
116
  import requests as req
 
130
  class AgentState(TypedDict):
      # Graph state with a single key: the list of conversation messages.
      # `add_messages` is attached through Annotated; presumably LangGraph uses
      # it as the reducer that merges newly returned messages into this list
      # -- TODO confirm against the graph construction.
      messages: Annotated[list[AnyMessage], add_messages]
132
 
133
+ SYSTEM_PROMPT = """You are a highly capable GAIA benchmark solver. Your goal is to answer questions accurately and precisely.
134
+
135
+ ## How to Solve Questions - Step by Step
136
+
137
+ 1. **Understand the Question**: Read carefully and identify:
138
+ - What type of answer is expected (number, text, list, date, etc.)
139
+ - Key constraints or special formats mentioned
140
+ - Whether a file or document is attached
141
+
142
+ 2. **Choose Your Approach**:
143
+ - For arithmetic/math: Use `calculator` or `python_repl`
144
+ - For current facts/events: Use `web_search`
145
+ - For historical/encyclopedic knowledge: Use `wikipedia_search`
146
+ - For attached files: Use `get_task_file`
147
+ - For complex logic/data processing: Use `python_repl`
148
+
149
+ 3. **Use Tools Effectively**:
150
+ - Search for key facts and verify information from multiple sources
151
+ - Extract relevant data from search results
152
+ - Perform calculations or transformations
153
+ - Cross-check results when possible
154
+
155
+ 4. **Format Your Final Answer**:
156
+ - For numbers: just the number (e.g., "42", "3.14", "-5")
157
+ - For text: exact text without extra punctuation (e.g., "Paris", "Monday")
158
+ - For lists: comma-separated values (e.g., "item1, item2, item3")
159
+ - For dates: use the format specified in the question
160
+ - If completely unsure: respond with just "Unknown"
161
+
162
+ 5. **End Response**:
163
+ After your reasoning, output a clean final answer on a new line:
164
+ FINAL ANSWER: <your answer>
165
+
166
+ ## Important Rules
167
+ - Never make up facts - always search or calculate
168
+ - Verify key numbers and spelling with web search
169
+ - If a calculation is involved, always show the work
170
+ - Be concise in your reasoning but thorough in verification
171
  """
172
 
173
  def _tool_to_openai_schema(t) -> dict:
 
197
 
198
  # Mappa nome → funzione tool per esecuzione
199
  self.tools_by_name = {t.name: t for t in self.tools_list}
200
+
201
+ hf_token = os.getenv("HF_TOKEN")
202
+ if not hf_token:
203
+ print("WARNING: HF_TOKEN non impostata. L'agente userà fallback locale e risposte molto limitate.")
204
+ self.client = None
205
+ else:
206
+ # InferenceClient diretto — usa la Serverless Inference API HF
207
+ self.client = InferenceClient(
208
+ api_key=hf_token,
209
+ )
210
 
211
  # Schema OpenAI dei tool per passarli al client
212
  self.tools_schema = [_tool_to_openai_schema(t) for t in self.tools_list]
 
260
  hf_messages = self._messages_to_hf_format([sys_msg] + state["messages"])
261
 
262
  response = self.client.chat_completion(
263
+ model="Qwen/Qwen2.5-72B-Instruct",
264
  messages=hf_messages,
265
  tools=self.tools_schema,
266
  tool_choice="auto",
267
+ max_tokens=1000,
268
+ temperature=0.1,
269
  )
270
 
271
  choice = response.choices[0].message
 
286
  tool_calls=tool_calls,
287
  )
288
  return {"messages": [ai_message]}
289
+
290
+ def _local_fallback_answer(self, question: str) -> str:
291
+ """
292
+ Minimal fallback when inference client is unavailable.
293
+ Attempts basic arithmetic only, otherwise returns Unknown.
294
+ """
295
+ q = question.lower().strip()
296
+
297
+ # Try simple arithmetic if it looks like a math problem
298
+ if re.search(r"(?:how\s+many|calculate|compute|what\s+is).*\d+", q):
299
+ try:
300
+ # Try to extract and evaluate a simple expression
301
+ numbers = re.findall(r"\d+\.?\d*", question)
302
+ if len(numbers) >= 2:
303
+ # Don't try to hardcode logic - just return Unknown
304
+ pass
305
+ except Exception:
306
+ pass
307
+
308
+ return "Unknown"
309
+
310
  def __call__(self, question: str) -> str:
311
+ print(f"Agent received question (first 100 chars): {question[:100]}...")
312
+
313
+ if self.client is None:
314
+ print("No HF InferenceClient configured; using local fallback logic")
315
+ return self._local_fallback_answer(question)
316
+
317
+ q_lower = question.lower().strip()
318
+
319
+ # Numeric math shortcut: explicit arithmetic detection reduces hallucination.
320
+ arithmetic = self._extract_arithmetic_expression(question)
321
+ if arithmetic:
322
+ calc = calculator(arithmetic)
323
+ if not calc.startswith("Calculation error"):
324
+ normalized = self._normalize_answer(calc)
325
+ print(f"Arithmetic shortcut using calculator: {arithmetic} -> {normalized}")
326
+ return normalized
327
+
328
+ # The main RL loop
329
+ answer = self._run_agent(question)
330
+
331
+ # Retry with explicit “Unknown” handling and chain-of-thought guidance
332
+ if answer == "Unknown":
333
+ print("Got Unknown on first pass; retrying with more explicit reasoning request")
334
+ replay_question = question + "\n\nPlease reason step by step with tool calls and provide FINAL ANSWER only." # gentle prompt nudge
335
+ answer = self._run_agent(replay_question)
336
+
337
+ print(f"Agent returning answer: '{answer}'")
338
+ return answer
339
+
340
def _run_agent(self, question: str) -> str:
    """Run the agent graph once and return the answer extracted from its last message.

    Any exception raised while running the graph or extracting the answer
    is logged and converted into a string result: the local fallback
    answer when the error text mentions "402" or "Payment Required",
    otherwise "Unknown".
    """
    try:
        final_state = self.graph.invoke({"messages": [HumanMessage(content=question)]})
        closing = final_state["messages"][-1]
        if isinstance(closing, AIMessage):
            response_text = closing.content
        else:
            response_text = str(closing)
        print(f"Agent raw output (first 300 chars): {response_text[:300]}...")
        return self._extract_answer(response_text)
    except Exception as exc:
        details = str(exc)
        print(f"Agent error during run: {details}")
        payment_blocked = any(marker in details for marker in ("402", "Payment Required"))
        if not payment_blocked:
            return "Unknown"
        fallback = self._local_fallback_answer(question)
        print(f"Payment required detected; using local fallback answer: {fallback}")
        return fallback
355
+
356
def _extract_answer(self, text: str) -> str:
    """Extract the final answer from the agent's raw output.

    Strategy 1: if the text contains one or more "FINAL ANSWER:" markers,
    the last one is authoritative. Its normalized value is returned as-is,
    including an explicit "Unknown", so callers can react to the model
    giving up instead of receiving an arbitrary reasoning line.

    Strategy 2: with no marker, scan the last four non-empty lines from
    the bottom up and return the first one that does not look like an
    error/uncertainty statement.

    Args:
        text: Raw model output.

    Returns:
        The normalized answer, or "Unknown" when nothing usable is found.
    """
    # Strategy 1: explicit marker. The model may quote the marker earlier
    # while reasoning, so the last occurrence is the one that counts.
    marked = re.findall(r"FINAL ANSWER:\s*(.+?)(?:\n|$)", text, re.IGNORECASE)
    if marked:
        answer = self._normalize_answer(marked[-1].strip())
        return "Unknown" if answer.lower() == "unknown" else answer

    # Strategy 2: fall back to the tail of the output.
    rejected_phrases = ("i'm not sure", "error", "failed", "final answer")
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    for candidate in reversed(lines[-4:]):
        lowered = candidate.lower()
        if any(phrase in lowered for phrase in rejected_phrases):
            continue
        normalized = self._normalize_answer(candidate)
        if normalized.lower() != "unknown":
            return normalized

    return "Unknown"
378
+
379
+ def _normalize_answer(self, answer: str) -> str:
380
+ """Normalize answer text (strip punctuation, normalize choices etc.)."""
381
+ answer_clean = answer.strip().strip('"\'').rstrip('.?,;')
382
+
383
+ if not answer_clean:
384
+ return "Unknown"
385
+
386
+ # Multiple-choice token: take plain option text if present
387
+ mc = re.match(r"^([A-D])\s*[:\)]\s*(.+)$", answer_clean, re.IGNORECASE)
388
+ if mc:
389
+ return mc.group(2).strip()
390
+
391
+ # Numeric decision: enforce numeric format for numeric questions
392
+ if re.match(r"^-?\d+(\.\d+)?$", answer_clean):
393
+ return answer_clean
394
+
395
+ return answer_clean
396
+
397
+ def _extract_arithmetic_expression(self, question: str) -> Optional[str]:
398
+ """Extract simple arithmetic expression candidate from a question for calculator use."""
399
+ m = re.search(r"([-+]?\d+(?:\.\d+)?(?:\s*[-+*/]\s*\d+(?:\.\d+)?)+)", question)
400
+ if not m:
401
+ return None
402
+ expr = m.group(1).replace("^", "**")
403
+ if re.search(r"[a-zA-Z]", expr):
404
+ return None
405
+ return expr
406
+
407
+ def _detect_question_type(self, question: str) -> str:
408
+ q = question.lower().strip()
409
+ if any(tok in q for tok in ["how many", "calculate", "compute", "sum", "difference", "times", "per cent", "%"]):
410
+ return "numeric"
411
+ if any(tok in q for tok in ["when", "year", "date", "born", "died"]):
412
+ return "date"
413
+ if any(tok in q for tok in ["which of the following", "option", "choose", "select"]):
414
+ return "multiple_choice"
415
+ return "factual"
416
+
417
 
418
  def run_and_submit_all( profile: gr.OAuthProfile | None):
419
  """
 
536
  return status_message, results_df
537
 
538
 
539
def build_gradio_ui():
    """Assemble the evaluation-runner page and return the Blocks object.

    The page holds a title, the instructions text, a login button, a run
    button, a status textbox and a results table; clicking the run button
    calls `run_and_submit_all` and routes its two return values to the
    textbox and the table. The interface is only built here, not launched.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Basic Agent Evaluation Runner")
        gr.Markdown(
            """
            **Instructions:**

            1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
            2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
            3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

            ---
            **Disclaimers:**
            Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
            This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
            """
        )

        gr.LoginButton()

        run_button = gr.Button("Run Evaluation & Submit All Answers")

        # Output widgets, in the order run_and_submit_all returns its values.
        status_output = gr.Textbox(
            label="Run Status / Submission Result",
            lines=5,
            interactive=False,
        )
        results_table = gr.DataFrame(
            label="Questions and Agent Answers",
            wrap=True,
        )
        output_widgets = [status_output, results_table]

        run_button.click(fn=run_and_submit_all, outputs=output_widgets)

    return demo
570
 
571
  if __name__ == "__main__":
572
  print("\n" + "-"*30 + " App Starting " + "-"*30)
573
+
574
  # Check for SPACE_HOST and SPACE_ID at startup for information
575
  space_host_startup = os.getenv("SPACE_HOST")
576
+ space_id_startup = os.getenv("SPACE_ID")
577
 
578
  if space_host_startup:
579
  print(f"✅ SPACE_HOST found: {space_host_startup}")
 
581
  else:
582
  print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
583
 
584
+ if space_id_startup:
585
  print(f"✅ SPACE_ID found: {space_id_startup}")
586
  print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
587
  print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
 
591
  print("-"*(60 + len(" App Starting ")) + "\n")
592
 
593
  print("Launching Gradio Interface for Basic Agent Evaluation...")
594
+ demo = build_gradio_ui()
595
+ demo.launch(debug=True, share=False)