jesusvilela committed on
Commit
949751a
·
verified ·
1 Parent(s): f1496a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -159
app.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # --- Imports ---
2
  import os
3
  import gradio as gr
@@ -12,8 +26,6 @@ import mimetypes
12
  import subprocess # For yt-dlp
13
  import io # For BytesIO with PIL
14
 
15
- # Removed: from huggingface_hub import get_space_runtime - not used for username with OAuth
16
-
17
  # --- Global Variables for Startup Status ---
18
  missing_vars_startup_list_global = []
19
  agent_pre_init_status_msg_global = "Agent status will be determined at startup."
@@ -21,16 +33,16 @@ agent_pre_init_status_msg_global = "Agent status will be determined at startup."
21
  # File Processing Libs
22
  try: from PyPDF2 import PdfReader; PYPDF2_AVAILABLE = True
23
  except ImportError: PYPDF2_AVAILABLE = False; print("WARNING: PyPDF2 not found, PDF tool will be disabled.")
24
- try: from PIL import Image; import pytesseract; PIL_TESSERACT_AVAILABLE = True # PIL is needed for new tool
25
  except ImportError: PIL_TESSERACT_AVAILABLE = False; print("WARNING: Pillow or Pytesseract not found, OCR tool will be disabled.")
26
  try: import whisper; WHISPER_AVAILABLE = True
27
  except ImportError: WHISPER_AVAILABLE = False; print("WARNING: OpenAI Whisper not found, Audio Transcription tool will be disabled.")
28
 
29
- # Google GenAI (Used by LangChain integration)
30
- from google.genai.types import HarmCategory, HarmBlockThreshold
31
 
32
  # LangChain
33
- from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage # Removed AnyMessage
34
  from langchain.prompts import PromptTemplate
35
  from langchain.tools import BaseTool, tool as lc_tool_decorator
36
  from langchain_google_genai import ChatGoogleGenerativeAI
@@ -41,13 +53,13 @@ from langchain_experimental.tools import PythonREPLTool
41
  # LangGraph Conditional Imports
42
  if TYPE_CHECKING:
43
  from langgraph.graph import StateGraph as StateGraphAliasedForHinting
44
- from langgraph.prebuilt import ToolExecutor as ToolExecutorAliasedForHinting
45
  from typing_extensions import TypedDict
46
  from langgraph.checkpoint.base import BaseCheckpointSaver
47
 
48
  LANGGRAPH_FLAVOR_AVAILABLE = False
49
  LG_StateGraph: Optional[Type[Any]] = None
50
- LG_ToolExecutor: Optional[Type[Any]] = None
51
  LG_END: Optional[Any] = None
52
  LG_ToolInvocation: Optional[Type[Any]] = None
53
  add_messages: Optional[Any] = None
@@ -55,40 +67,56 @@ MemorySaver_Class: Optional[Type[Any]] = None
55
 
56
  AGENT_INSTANCE: Optional[Union[AgentExecutor, Any]] = None
57
  TOOLS: List[BaseTool] = []
58
- LLM_INSTANCE: Optional[ChatGoogleGenerativeAI] = None # This is the agent's "planner"
59
  LANGGRAPH_MEMORY_SAVER: Optional[Any] = None
60
 
61
- # --- google-genai Client SDK (for the new direct multimodal tool) ---
62
  from google import genai as google_genai_sdk
63
- google_genai_client: Optional[google_genai_sdk.Client] = None # Initialized later
64
- # --- End google-genai Client SDK section ---
65
 
66
  try:
67
  from langgraph.graph import StateGraph, END
68
- from langgraph.prebuilt import ToolExecutor, ToolInvocation as LGToolInvocationActual
69
- from langgraph.graph.message import add_messages as lg_add_messages
70
- from langgraph.checkpoint.memory import MemorySaver as LGMemorySaver
71
- LANGGRAPH_FLAVOR_AVAILABLE = True
72
- LG_StateGraph, LG_ToolExecutor, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class = \
73
- StateGraph, ToolExecutor, END, LGToolInvocationActual, lg_add_messages, LGMemorySaver
74
- print("Successfully imported LangGraph components.")
75
- except ImportError as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  LANGGRAPH_FLAVOR_AVAILABLE = False
77
- # Assign None to all to prevent NameError if used before assignment
78
- LG_StateGraph, LG_ToolExecutor, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class = (None,) * 6
79
- print(f"WARNING: LangGraph components not found or import error: {e}. LangGraph agent will be disabled.")
80
 
81
  # --- Constants ---
82
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
83
- # GEMINI_MODEL_NAME is for the agent's planner LLM (LangChain)
84
- GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-05-06" # Retained from original for planner
85
- # GEMINI_FLASH_MULTIMODAL_MODEL_NAME is for the new direct multimodal tool (google-genai client SDK)
86
  GEMINI_FLASH_MULTIMODAL_MODEL_NAME = "gemini-2.0-flash-exp"
87
-
88
  SCORING_API_BASE_URL = os.getenv("SCORING_API_URL", DEFAULT_API_URL)
89
  MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
90
  LOCAL_FILE_STORE_PATH = "./Data"
91
- os.makedirs(LOCAL_FILE_STORE_PATH, exist_ok=True) # Create data directory at startup
92
 
93
  # --- Global State ---
94
  WHISPER_MODEL: Optional[Any] = None
@@ -108,13 +136,13 @@ if GOOGLE_API_KEY:
108
  logger.info("google-genai SDK Client initialized successfully.")
109
  except Exception as e:
110
  logger.error(f"Failed to initialize google-genai SDK Client: {e}")
111
- google_genai_client = None # Ensure it's None if init fails
112
  else:
113
- logger.warning("GOOGLE_API_KEY not found. google-genai SDK Client (for direct multimodal tool) not initialized.")
114
- # --- End Initialize google-genai Client SDK ---
115
 
116
- # --- Helper Functions (Unchanged from your original) ---
117
  def _strip_exact_match_answer(text: Any) -> str:
 
118
  if not isinstance(text, str): text = str(text)
119
  text_lower_check = text.lower()
120
  if text_lower_check.startswith("final answer:"):
@@ -132,15 +160,17 @@ def _strip_exact_match_answer(text: Any) -> str:
132
  return text.strip()
133
 
134
  def _is_full_url(url_string: str) -> bool:
 
135
  try: result = urlparse(url_string); return all([result.scheme, result.netloc])
136
  except ValueError: return False
137
 
138
  def _is_youtube_url(url: str) -> bool:
 
139
  parsed_url = urlparse(url)
140
  return parsed_url.netloc.lower().endswith(("youtube.com", "youtu.be"))
141
 
142
  def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None) -> str:
143
- # ... (Your original _download_file function - unchanged)
144
  os.makedirs(LOCAL_FILE_STORE_PATH, exist_ok=True)
145
  logger.debug(f"Download request: '{file_identifier}', task_id: {task_id_for_file}")
146
  original_filename = os.path.basename(urlparse(file_identifier).path) if _is_full_url(file_identifier) else os.path.basename(file_identifier)
@@ -221,8 +251,8 @@ def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None)
221
 
222
  if effective_save_path != tentative_local_path and os.path.exists(effective_save_path) and os.path.getsize(effective_save_path) > 0:
223
  logger.info(f"Cached file (CD name): {effective_save_path}"); return effective_save_path
224
- with open(effective_save_path, "wb") as f:
225
- for chunk in r.iter_content(chunk_size=1024*1024): f.write(chunk)
226
  logger.info(f"File downloaded to {effective_save_path}"); return effective_save_path
227
  except requests.exceptions.HTTPError as e:
228
  err_msg = f"HTTP {e.response.status_code} for {file_url_to_try}. Detail: {e.response.text[:100]}"
@@ -230,7 +260,7 @@ def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None)
230
  except Exception as e:
231
  logger.error(f"Download error for {file_url_to_try}: {e}", exc_info=True); return f"Error: {str(e)[:100]}"
232
 
233
- # --- Tool Function Definitions (Original tools unchanged) ---
234
  READ_PDF_TOOL_DESC = "Reads text content from a PDF file. Input: JSON '{\"file_identifier\": \"FILENAME_OR_URL\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\"}'. Returns extracted text."
235
  @lc_tool_decorator(description=READ_PDF_TOOL_DESC)
236
  def read_pdf_tool(action_input_json_str: str) -> str:
@@ -242,19 +272,18 @@ def read_pdf_tool(action_input_json_str: str) -> str:
242
  path = _download_file(file_id, task_id)
243
  if path.startswith("Error:"): return path
244
  try:
245
- text = "";
246
- with open(path, "rb") as f:
247
- reader = PdfReader(f)
248
  if reader.is_encrypted:
249
  try: reader.decrypt('')
250
  except: return f"Error: PDF '{path}' encrypted."
251
  for page_num in range(len(reader.pages)):
252
  page = reader.pages[page_num]
253
- text += page.extract_text() + "\n\n"
254
- return text[:40000]
255
  except Exception as e: return f"Error reading PDF '{path}': {e}"
256
 
257
-
258
  OCR_IMAGE_TOOL_DESC = "Extracts text from an image using OCR. Input: JSON '{\"file_identifier\": \"FILENAME_OR_URL\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\"}'. Returns extracted text."
259
  @lc_tool_decorator(description=OCR_IMAGE_TOOL_DESC)
260
  def ocr_image_tool(action_input_json_str: str) -> str:
@@ -285,143 +314,189 @@ def transcribe_audio_tool(action_input_json_str: str) -> str:
285
  try: result = WHISPER_MODEL.transcribe(path, fp16=False); return result["text"][:40000] # type: ignore
286
  except Exception as e: logger.error(f"Whisper error on '{path}': {e}", exc_info=True); return f"Error transcribing '{path}': {e}"
287
 
288
- # +++ NEW TOOL using google-genai Client SDK for Multimodal Prompts +++
289
  DIRECT_MULTIMODAL_GEMINI_TOOL_DESC = (
290
  "Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-2.0-flash-exp) "
291
- "for tasks like image description, answering questions about the image, or generating text based on the image. "
292
- "Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction related to the image.\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\" (optional)}'. "
293
  "Returns the model's text response."
294
  )
295
  @lc_tool_decorator(description=DIRECT_MULTIMODAL_GEMINI_TOOL_DESC)
296
  def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
297
- global google_genai_client # Use the initialized client
298
- if not google_genai_client:
299
- return "Error: google-genai SDK client not initialized. GOOGLE_API_KEY might be missing."
300
- if not PIL_TESSERACT_AVAILABLE : # Check if PIL is available, as it's used to open the image
301
- return "Error: Pillow (PIL) library is not available for image processing."
302
-
303
  try:
304
  data = json.loads(action_input_json_str)
305
  file_identifier = data.get("file_identifier")
306
- text_prompt = data.get("text_prompt", "Describe this image.") # Default prompt
307
- task_id = data.get("task_id") # Optional, for _download_file if needed
308
-
309
- if not file_identifier:
310
- return "Error: 'file_identifier' for the image is missing in the input."
311
-
312
- logger.info(f"Direct Multimodal Tool: Processing image '{file_identifier}' with prompt '{text_prompt}'")
313
-
314
- # Download the file to a local path (handles URLs and GAIA files)
315
  local_image_path = _download_file(file_identifier, task_id)
316
- if local_image_path.startswith("Error:"):
317
- return f"Error downloading image for Direct Multimodal Tool: {local_image_path}"
318
-
319
- # Open the image using Pillow
320
  try:
321
  pil_image = Image.open(local_image_path)
322
- pil_image.thumbnail((1024, 1024)) # Optional: resize large images
323
- except Exception as e_img:
324
- logger.error(f"Error opening image at {local_image_path}: {e_img}")
325
- return f"Error opening image file {local_image_path}: {str(e_img)}"
326
-
327
- # Send to Gemini Flash model using the client SDK
328
  response = google_genai_client.models.generate_content(
329
- model=GEMINI_FLASH_MULTIMODAL_MODEL_NAME, # Use the specified Flash model
330
- contents=[pil_image, text_prompt] # Pass PIL image and text prompt
331
  )
332
- logger.info(f"Direct Multimodal Tool: Response received from {GEMINI_FLASH_MULTIMODAL_MODEL_NAME}.")
333
- return response.text[:40000] # Return model's text response, truncated if very long
334
-
335
- except json.JSONDecodeError as e_json:
336
- return f"Error parsing JSON input for Direct Multimodal Tool: {str(e_json)}. Input was: {action_input_json_str}"
337
- except Exception as e_tool:
338
- logger.error(f"Error in direct_multimodal_gemini_tool: {e_tool}", exc_info=True)
339
- return f"Error executing Direct Multimodal Tool: {str(e_tool)}"
340
- # +++ END NEW TOOL +++
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
 
343
- # --- Agent Prompts (Slightly updated to include the new tool name if available) ---
344
- # (Agent prompts remain largely the same, the agent will learn to use tools from their descriptions)
345
-
346
  # --- Agent Initialization and Response Logic ---
347
  def initialize_agent_and_tools(force_reinit=False):
348
- global AGENT_INSTANCE, TOOLS, LLM_INSTANCE, LANGGRAPH_FLAVOR_AVAILABLE, LG_StateGraph, LG_ToolExecutor, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class, LANGGRAPH_MEMORY_SAVER, google_genai_client
349
  if AGENT_INSTANCE and not force_reinit: logger.info("Agent already initialized."); return
350
  logger.info("Initializing agent and tools...")
351
  if not GOOGLE_API_KEY: raise ValueError("GOOGLE_API_KEY not set for LangChain LLM.")
352
 
353
- # Initialize LangChain LLM (Planner)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  try:
355
- LLM_INSTANCE = ChatGoogleGenerativeAI(model=GEMINI_MODEL_NAME, google_api_key=GOOGLE_API_KEY, temperature=0.0,
356
- safety_settings={HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
357
- HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,},
358
- request_timeout=120, convert_system_message_to_human=True )
 
 
 
 
359
  logger.info(f"LangChain LLM (Planner) initialized: {GEMINI_MODEL_NAME}")
360
- except Exception as e: logger.error(f"LangChain LLM init failed: {e}", exc_info=True); raise
 
 
361
 
362
  TOOLS = []
363
  if PYPDF2_AVAILABLE: TOOLS.append(read_pdf_tool)
364
  if PIL_TESSERACT_AVAILABLE: TOOLS.append(ocr_image_tool)
365
  if WHISPER_AVAILABLE: TOOLS.append(transcribe_audio_tool)
366
-
367
- # Add the new direct multimodal tool if its dependencies (client, PIL) are met
368
- if google_genai_client and PIL_TESSERACT_AVAILABLE: # PIL_TESSERACT_AVAILABLE implies PIL is available
369
- TOOLS.append(direct_multimodal_gemini_tool)
370
- logger.info("Added 'direct_multimodal_gemini_tool'.")
371
- else:
372
- logger.warning("'direct_multimodal_gemini_tool' NOT added due to missing google_genai_client or PIL.")
373
-
374
  try: search_tool = DuckDuckGoSearchRun(name="web_search"); search_tool.description = "Web search. Input: query."; TOOLS.append(search_tool)
375
  except Exception as e: logger.warning(f"DuckDuckGoSearchRun init failed: {e}")
376
  try: python_repl = PythonREPLTool(name="python_repl"); python_repl.description = "Python REPL. print() for output."; TOOLS.append(python_repl)
377
  except Exception as e: logger.warning(f"PythonREPLTool init failed: {e}")
378
  logger.info(f"Final tools list for agent: {[t.name for t in TOOLS]}")
379
 
380
-
381
- # ... (Rest of your initialize_agent_and_tools function for LangGraph/ReAct setup - unchanged)
382
- if LANGGRAPH_FLAVOR_AVAILABLE and all([LG_StateGraph, LG_ToolExecutor, LG_END, LLM_INSTANCE, LG_ToolInvocation, add_messages]):
383
  if not LANGGRAPH_MEMORY_SAVER and MemorySaver_Class: LANGGRAPH_MEMORY_SAVER = MemorySaver_Class(); logger.info("LangGraph MemorySaver initialized.")
384
  try:
385
- logger.info(f"Attempting LangGraph init (Memory: {LANGGRAPH_MEMORY_SAVER is not None})")
386
  _TypedDict = getattr(__import__('typing_extensions'), 'TypedDict', dict)
387
- class AgentState(_TypedDict): input: str; messages: Annotated[List[Any], add_messages] # Use Any for AnyMessage for broader compatibility
388
 
389
- prompt_content_lg = LANGGRAPH_PROMPT_TEMPLATE_STR.format(
390
- tools="\n".join([f"- {t.name}: {t.description}" for t in TOOLS]),
391
- input="{input}"
392
  )
393
  def agent_node(state: AgentState):
394
- current_input_lg = state.get('input', '')
395
- formatted_system_prompt_lg = prompt_content_lg.replace("{input}", current_input_lg)
396
- messages_for_llm_lg = [SystemMessage(content=formatted_system_prompt_lg)] + state.get('messages', [])
397
- bound_llm_for_tools_lg = LLM_INSTANCE.bind_tools(TOOLS)
398
- response_from_llm_lg = bound_llm_for_tools_lg.invoke(messages_for_llm_lg)
399
- return {"messages": [response_from_llm_lg]}
400
-
401
- tool_executor_lg_instance = LG_ToolExecutor(TOOLS) # type: ignore
402
- def tool_node(state: AgentState):
403
- last_msg_lg = state['messages'][-1] if state.get('messages') and isinstance(state['messages'][-1], AIMessage) else None
404
- if not last_msg_lg or not last_msg_lg.tool_calls: return {"messages": []}
405
- tool_results_lg = []
406
- for tc_lg in last_msg_lg.tool_calls:
407
- name_lg, args_lg, tc_id_lg = tc_lg.get('name'), tc_lg.get('args'), tc_lg.get('id')
408
- if not all([name_lg, isinstance(args_lg, dict), tc_id_lg]):
409
- err_msg_lg=f"Invalid tool_call: {tc_lg}"; logger.error(err_msg_lg)
410
- tool_results_lg.append(ToolMessage(f"Error: {err_msg_lg}", tool_call_id=tc_id_lg or "error_id", name=name_lg or "error_tool"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  continue
412
  try:
413
- logger.info(f"LG Tool Invoking: '{name_lg}' with {args_lg} (ID: {tc_id_lg})")
414
- tool_invocation_obj_lg = LG_ToolInvocation(tool=name_lg, tool_input=args_lg) # type: ignore
415
- output_lg = tool_executor_lg_instance.invoke(tool_invocation_obj_lg)
416
- tool_results_lg.append(ToolMessage(content=str(output_lg), tool_call_id=tc_id_lg, name=name_lg))
 
417
  except Exception as e_tool_node_lg:
418
- logger.error(f"LG Tool Error ('{name_lg}'): {e_tool_node_lg}", exc_info=True)
419
- tool_results_lg.append(ToolMessage(content=f"Error for tool {name_lg}: {str(e_tool_node_lg)}", tool_call_id=tc_id_lg, name=name_lg))
420
- return {"messages": tool_results_lg}
 
421
 
422
  workflow_lg = LG_StateGraph(AgentState) # type: ignore
423
  workflow_lg.add_node("agent", agent_node)
424
- workflow_lg.add_node("tools", tool_node)
 
 
425
  workflow_lg.set_entry_point("agent")
426
  def should_continue_lg(state: AgentState): return "tools" if state['messages'][-1].tool_calls else LG_END
427
  workflow_lg.add_conditional_edges("agent", should_continue_lg, {"tools": "tools", LG_END: LG_END}) # type: ignore
@@ -450,11 +525,9 @@ def initialize_agent_and_tools(force_reinit=False):
450
  if not AGENT_INSTANCE: raise RuntimeError("CRITICAL: Agent initialization completely failed.")
451
  logger.info(f"Agent init finished. Active agent type: {type(AGENT_INSTANCE).__name__}")
452
 
453
-
454
- # --- get_agent_response, construct_prompt_for_agent, run_and_submit_all ---
455
- # --- These functions remain UNCHANGED from your original code ---
456
  def get_agent_response(prompt: str, task_id: Optional[str]=None, thread_id: Optional[str]=None) -> str:
457
- # ... (Your original get_agent_response logic) ...
458
  global AGENT_INSTANCE, LLM_INSTANCE
459
  thread_id_to_use = thread_id or (f"gaia_task_{task_id}" if task_id else hashlib.md5(prompt.encode()).hexdigest()[:8])
460
  if not AGENT_INSTANCE or not LLM_INSTANCE:
@@ -470,7 +543,7 @@ def get_agent_response(prompt: str, task_id: Optional[str]=None, thread_id: Opti
470
  logger.debug(f"Using LangGraph agent (Memory: {LANGGRAPH_MEMORY_SAVER is not None}) for thread: {thread_id_to_use}")
471
  initial_messages_lg_get = []
472
  input_for_lg_get = {"input": prompt, "messages": initial_messages_lg_get}
473
- final_state_lg_get = AGENT_INSTANCE.invoke(input_for_lg_get, {"configurable": {"thread_id": thread_id_to_use}})
474
  if not final_state_lg_get or 'messages' not in final_state_lg_get or not final_state_lg_get['messages']:
475
  logger.error("LangGraph: No final state/messages."); return "[ERROR] LangGraph: No final state/messages."
476
  for message_item_lg_get in reversed(final_state_lg_get['messages']):
@@ -489,7 +562,7 @@ def get_agent_response(prompt: str, task_id: Optional[str]=None, thread_id: Opti
489
  return f"[ERROR] Agent execution failed: {str(e_agent_run_get)[:150]}"
490
 
491
  def construct_prompt_for_agent(q: Dict[str,Any]) -> str:
492
- # ... (Your original construct_prompt_for_agent logic) ...
493
  tid,q_str=q.get("task_id","N/A"),q.get("question",""); files=q.get("files",[])
494
  files_info = ("\nFiles:\n"+"\n".join([f"- {f} (task_id:{tid})"for f in files])) if files else ""
495
  level = f"\nLevel:{q.get('level')}" if q.get('level') else ""
@@ -500,19 +573,16 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
500
  global AGENT_INSTANCE
501
  space_id = os.getenv("SPACE_ID")
502
  username_for_submission = None
503
-
504
  if profile and hasattr(profile, 'username') and profile.username:
505
  username_for_submission = profile.username
506
  logger.info(f"Username from OAuth profile: {username_for_submission}")
507
  else:
508
  logger.warning("OAuth profile not available or username missing.")
509
  return "Hugging Face login required. Please use the login button and try again.", None
510
-
511
  if AGENT_INSTANCE is None:
512
  try: logger.info("Agent not pre-initialized. Initializing for run..."); initialize_agent_and_tools()
513
  except Exception as e: return f"Agent on-demand initialization failed: {e}", None
514
  if AGENT_INSTANCE is None: return "Agent is still None after on-demand init.", None
515
-
516
  agent_code_url_run=f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local_dev_run"
517
  questions_url_run,submit_url_run=f"{DEFAULT_API_URL}/questions",f"{DEFAULT_API_URL}/submit"
518
  auth_headers_run={"Authorization":f"Bearer {HUGGINGFACE_TOKEN}"} if HUGGINGFACE_TOKEN else {}
@@ -522,7 +592,6 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
522
  if not questions_data_run or not isinstance(questions_data_run,list):logger.error(f"Invalid questions data: {questions_data_run}");return "Fetched questions_data invalid.",None
523
  logger.info(f"Fetched {len(questions_data_run)} questions.")
524
  except Exception as e:logger.error(f"Fetch questions error: {e}",exc_info=True);return f"Fetch questions error:{e}",None
525
-
526
  results_log_run,answers_payload_run=[],[]
527
  logger.info(f"Running agent on {len(questions_data_run)} questions for user '{username_for_submission}'...")
528
  for i,item_run in enumerate(questions_data_run):
@@ -538,7 +607,6 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
538
  logger.error(f"Agent error task {task_id_run}:{e}",exc_info=True);error_answer_run=f"AGENT ERROR:{str(e)[:100]}"
539
  answers_payload_run.append({"task_id":task_id_run,"submitted_answer":"N/A [AGENT_ERROR]"})
540
  results_log_run.append({"Task ID":task_id_run,"Question":question_text_run,"Full Agent Prompt":prompt_run,"Raw Agent Output":error_answer_run,"Submitted Answer":"N/A [AGENT_ERROR]"})
541
-
542
  if not answers_payload_run:return "Agent produced no answers.",pd.DataFrame(results_log_run)
543
  submission_payload_run={"username":username_for_submission.strip(),"agent_code":agent_code_url_run,"answers":answers_payload_run}
544
  logger.info(f"Submitting {len(answers_payload_run)} answers to {submit_url_run} for user '{username_for_submission}'...")
@@ -551,10 +619,8 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
551
  error_http_run=f"HTTP {e.response.status_code}. Detail:{e.response.text[:200]}"; logger.error(f"Submit Fail:{error_http_run}",exc_info=True); return f"Submit Fail:{error_http_run}",pd.DataFrame(results_log_run)
552
  except Exception as e:logger.error(f"Submit Fail unexpected:{e}",exc_info=True);return f"Submit Fail:{str(e)[:100]}",pd.DataFrame(results_log_run)
553
 
554
-
555
- # --- Build Gradio Interface (Unchanged from your original) ---
556
  with gr.Blocks(css=".gradio-container {max-width:1280px !important;margin:auto !important;}",theme=gr.themes.Soft()) as demo:
557
- # ... (Your original Gradio UI layout - unchanged) ...
558
  gr.Markdown("# GAIA Agent Challenge Runner v7 (OAuth for Username)")
559
  gr.Markdown(f"""**Instructions:**
560
  1. **Login with Hugging Face** using the button below. Your HF username will be used for submission.
@@ -569,12 +635,11 @@ with gr.Blocks(css=".gradio-container {max-width:1280px !important;margin:auto !
569
  gr.LoginButton()
570
  run_button = gr.Button("Run Evaluation & Submit All Answers")
571
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=7, interactive=False)
572
- results_table = gr.DataFrame(label="Q&A Log", headers=["Task ID","Question","Prompt","Raw","Submitted"], wrap=True)
573
 
574
  run_button.click(fn=run_and_submit_all, outputs=[status_output,results_table], api_name="run_evaluation")
575
 
576
  def update_ui_on_load_fn_within_context():
577
- # ... (Your original update_ui_on_load_fn_within_context logic - unchanged) ...
578
  global missing_vars_startup_list_global, agent_pre_init_status_msg_global
579
  secrets_msg_md = ""
580
  if missing_vars_startup_list_global:
@@ -587,20 +652,21 @@ with gr.Blocks(css=".gradio-container {max-width:1280px !important;margin:auto !
587
  if env_issues: secrets_msg_md += f"<br/><font color='orange'>**Tool Deps Missing:** {', '.join(env_issues)}.</font>"
588
  current_status_md = agent_pre_init_status_msg_global
589
  if not LANGGRAPH_FLAVOR_AVAILABLE and "LangGraph" not in current_status_md:
590
- current_status_md += " (LangGraph core import failed, ReAct fallback.)"
 
 
591
  return { agent_status_display: gr.Markdown(value=current_status_md),
592
  missing_secrets_display: gr.Markdown(value=secrets_msg_md) }
593
 
594
  demo.load(update_ui_on_load_fn_within_context, [], [agent_status_display, missing_secrets_display])
595
 
596
  if __name__ == "__main__":
597
- # ... (Your original __main__ block for startup logging and pre-initialization - unchanged) ...
598
- logger.info("Application starting up (v7 with Direct Multimodal Tool)...")
599
  if not PYPDF2_AVAILABLE: logger.warning("PyPDF2 (PDF tool) NOT AVAILABLE.")
600
- if not PIL_TESSERACT_AVAILABLE: logger.warning("Pillow/Pytesseract (OCR tool) NOT AVAILABLE.") # PIL also needed for new tool
601
  if not WHISPER_AVAILABLE: logger.warning("Whisper (Audio tool) NOT AVAILABLE.")
602
- if LANGGRAPH_FLAVOR_AVAILABLE: logger.info("Core LangGraph (StateGraph, END) loaded.")
603
- else: logger.warning("Core LangGraph FAILED import. ReAct fallback. Check requirements & Space build logs.")
604
 
605
  missing_vars_startup_list_global.clear()
606
  if not GOOGLE_API_KEY: missing_vars_startup_list_global.append("GOOGLE_API_KEY")
@@ -608,17 +674,18 @@ if __name__ == "__main__":
608
 
609
  try:
610
  logger.info("Pre-initializing agent...")
611
- initialize_agent_and_tools() # This will now include the new direct_multimodal_gemini_tool
612
  if AGENT_INSTANCE:
613
  agent_type_name = type(AGENT_INSTANCE).__name__
614
  agent_pre_init_status_msg_global = f"Agent Pre-initialized: **{agent_type_name}**."
615
- if LANGGRAPH_FLAVOR_AVAILABLE and "StateGraph" in agent_type_name: # More robust check
616
- agent_pre_init_status_msg_global = f"Agent Pre-initialized: **LangGraph** (Memory: {LANGGRAPH_MEMORY_SAVER is not None})."
 
617
  else: agent_pre_init_status_msg_global = "Agent pre-init FAILED (AGENT_INSTANCE is None)."
618
  logger.info(agent_pre_init_status_msg_global.replace("**",""))
619
  except Exception as e:
620
- agent_pre_init_status_msg_global = f"Agent pre-init CRASHED: {str(e)[:100]}."
621
- logger.critical(f"Agent pre-init CRASHED: {e}", exc_info=True)
622
 
623
  logger.info(f"Space ID: {os.getenv('SPACE_ID', 'Not Set')}")
624
  logger.info("Gradio Interface launching...")
 
1
+ # Copyright 2025 Jesus Vilela Jato.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
  # --- Imports ---
16
  import os
17
  import gradio as gr
 
26
  import subprocess # For yt-dlp
27
  import io # For BytesIO with PIL
28
 
 
 
29
  # --- Global Variables for Startup Status ---
30
  missing_vars_startup_list_global = []
31
  agent_pre_init_status_msg_global = "Agent status will be determined at startup."
 
33
  # File Processing Libs
34
  try: from PyPDF2 import PdfReader; PYPDF2_AVAILABLE = True
35
  except ImportError: PYPDF2_AVAILABLE = False; print("WARNING: PyPDF2 not found, PDF tool will be disabled.")
36
+ try: from PIL import Image; import pytesseract; PIL_TESSERACT_AVAILABLE = True
37
  except ImportError: PIL_TESSERACT_AVAILABLE = False; print("WARNING: Pillow or Pytesseract not found, OCR tool will be disabled.")
38
  try: import whisper; WHISPER_AVAILABLE = True
39
  except ImportError: WHISPER_AVAILABLE = False; print("WARNING: OpenAI Whisper not found, Audio Transcription tool will be disabled.")
40
 
41
+ # Google GenAI (Used by LangChain integration AND direct client)
42
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
43
 
44
  # LangChain
45
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage
46
  from langchain.prompts import PromptTemplate
47
  from langchain.tools import BaseTool, tool as lc_tool_decorator
48
  from langchain_google_genai import ChatGoogleGenerativeAI
 
53
  # LangGraph Conditional Imports
54
  if TYPE_CHECKING:
55
  from langgraph.graph import StateGraph as StateGraphAliasedForHinting
56
+ from langgraph.prebuilt import ToolNode as ToolExecutorAliasedForHinting # Prefer ToolNode
57
  from typing_extensions import TypedDict
58
  from langgraph.checkpoint.base import BaseCheckpointSaver
59
 
60
  LANGGRAPH_FLAVOR_AVAILABLE = False
61
  LG_StateGraph: Optional[Type[Any]] = None
62
+ LG_ToolExecutor_Class: Optional[Type[Any]] = None # Store the class (ToolNode or ToolExecutor)
63
  LG_END: Optional[Any] = None
64
  LG_ToolInvocation: Optional[Type[Any]] = None
65
  add_messages: Optional[Any] = None
 
67
 
68
  AGENT_INSTANCE: Optional[Union[AgentExecutor, Any]] = None
69
  TOOLS: List[BaseTool] = []
70
+ LLM_INSTANCE: Optional[ChatGoogleGenerativeAI] = None
71
  LANGGRAPH_MEMORY_SAVER: Optional[Any] = None
72
 
73
+ # google-genai Client SDK
74
  from google import genai as google_genai_sdk
75
+ google_genai_client: Optional[google_genai_sdk.Client] = None
 
76
 
77
  try:
78
  from langgraph.graph import StateGraph, END
79
+ try:
80
+ from langgraph.prebuilt import ToolNode # Common in newer langgraph
81
+ LG_ToolExecutor_Class = ToolNode # Assign ToolNode class
82
+ print("Using langgraph.prebuilt.ToolNode for LangGraph tool execution.")
83
+ except ImportError:
84
+ try:
85
+ from langgraph.prebuilt import ToolExecutor # Original attempt
86
+ LG_ToolExecutor_Class = ToolExecutor
87
+ print("Using langgraph.prebuilt.ToolExecutor (fallback) for LangGraph tool execution.")
88
+ except ImportError as e_lg_exec_inner:
89
+ print(f"Failed to import ToolNode and ToolExecutor from langgraph.prebuilt: {e_lg_exec_inner}")
90
+ LG_ToolExecutor_Class = None
91
+
92
+ if LG_ToolExecutor_Class is not None: # Proceed only if a tool executor class was found
93
+ from langgraph.prebuilt import ToolInvocation as LGToolInvocationActual
94
+ from langgraph.graph.message import add_messages as lg_add_messages
95
+ from langgraph.checkpoint.memory import MemorySaver as LGMemorySaver
96
+ LANGGRAPH_FLAVOR_AVAILABLE = True
97
+ LG_StateGraph, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class = \
98
+ StateGraph, END, LGToolInvocationActual, lg_add_messages, LGMemorySaver
99
+ print("Successfully imported LangGraph components.")
100
+ else:
101
+ # This ensures LANGGRAPH_FLAVOR_AVAILABLE remains False if no executor was found
102
+ LANGGRAPH_FLAVOR_AVAILABLE = False
103
+ LG_StateGraph, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class = (None,) * 5
104
+ print(f"WARNING: No suitable LangGraph tool executor (ToolNode/ToolExecutor) found. LangGraph agent will be disabled.")
105
+
106
+ except ImportError as e: # Catch import error for StateGraph, END itself
107
  LANGGRAPH_FLAVOR_AVAILABLE = False
108
+ LG_StateGraph, LG_ToolExecutor_Class, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class = (None,) * 6
109
+ print(f"WARNING: Core LangGraph components (StateGraph, END) not found or import error: {e}. LangGraph agent will be disabled.")
110
+
111
 
112
  # --- Constants ---
113
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
114
+ GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-05-06"
 
 
115
  GEMINI_FLASH_MULTIMODAL_MODEL_NAME = "gemini-2.0-flash-exp"
 
116
  SCORING_API_BASE_URL = os.getenv("SCORING_API_URL", DEFAULT_API_URL)
117
  MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
118
  LOCAL_FILE_STORE_PATH = "./Data"
119
+ os.makedirs(LOCAL_FILE_STORE_PATH, exist_ok=True)
120
 
121
  # --- Global State ---
122
  WHISPER_MODEL: Optional[Any] = None
 
136
  logger.info("google-genai SDK Client initialized successfully.")
137
  except Exception as e:
138
  logger.error(f"Failed to initialize google-genai SDK Client: {e}")
139
+ google_genai_client = None
140
  else:
141
+ logger.warning("GOOGLE_API_KEY not found. google-genai SDK Client not initialized.")
 
142
 
143
+ # --- Helper Functions ---
144
  def _strip_exact_match_answer(text: Any) -> str:
145
+ # ... (Your original _strip_exact_match_answer function)
146
  if not isinstance(text, str): text = str(text)
147
  text_lower_check = text.lower()
148
  if text_lower_check.startswith("final answer:"):
 
160
  return text.strip()
161
 
162
  def _is_full_url(url_string: str) -> bool:
163
+ # ... (Your original _is_full_url function)
164
  try: result = urlparse(url_string); return all([result.scheme, result.netloc])
165
  except ValueError: return False
166
 
167
  def _is_youtube_url(url: str) -> bool:
168
+ # ... (Your original _is_youtube_url function)
169
  parsed_url = urlparse(url)
170
  return parsed_url.netloc.lower().endswith(("youtube.com", "youtu.be"))
171
 
172
  def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None) -> str:
173
+ # ... (Your original _download_file function - unchanged) ...
174
  os.makedirs(LOCAL_FILE_STORE_PATH, exist_ok=True)
175
  logger.debug(f"Download request: '{file_identifier}', task_id: {task_id_for_file}")
176
  original_filename = os.path.basename(urlparse(file_identifier).path) if _is_full_url(file_identifier) else os.path.basename(file_identifier)
 
251
 
252
  if effective_save_path != tentative_local_path and os.path.exists(effective_save_path) and os.path.getsize(effective_save_path) > 0:
253
  logger.info(f"Cached file (CD name): {effective_save_path}"); return effective_save_path
254
+ with open(effective_save_path, "wb") as f_download:
255
+ for chunk in r.iter_content(chunk_size=1024*1024): f_download.write(chunk)
256
  logger.info(f"File downloaded to {effective_save_path}"); return effective_save_path
257
  except requests.exceptions.HTTPError as e:
258
  err_msg = f"HTTP {e.response.status_code} for {file_url_to_try}. Detail: {e.response.text[:100]}"
 
260
  except Exception as e:
261
  logger.error(f"Download error for {file_url_to_try}: {e}", exc_info=True); return f"Error: {str(e)[:100]}"
262
 
263
+ # --- Tool Function Definitions ---
264
  READ_PDF_TOOL_DESC = "Reads text content from a PDF file. Input: JSON '{\"file_identifier\": \"FILENAME_OR_URL\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\"}'. Returns extracted text."
265
  @lc_tool_decorator(description=READ_PDF_TOOL_DESC)
266
  def read_pdf_tool(action_input_json_str: str) -> str:
 
272
  path = _download_file(file_id, task_id)
273
  if path.startswith("Error:"): return path
274
  try:
275
+ text_content = "";
276
+ with open(path, "rb") as f_pdf:
277
+ reader = PdfReader(f_pdf)
278
  if reader.is_encrypted:
279
  try: reader.decrypt('')
280
  except: return f"Error: PDF '{path}' encrypted."
281
  for page_num in range(len(reader.pages)):
282
  page = reader.pages[page_num]
283
+ text_content += page.extract_text() + "\n\n"
284
+ return text_content[:40000]
285
  except Exception as e: return f"Error reading PDF '{path}': {e}"
286
 
 
287
  OCR_IMAGE_TOOL_DESC = "Extracts text from an image using OCR. Input: JSON '{\"file_identifier\": \"FILENAME_OR_URL\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\"}'. Returns extracted text."
288
  @lc_tool_decorator(description=OCR_IMAGE_TOOL_DESC)
289
  def ocr_image_tool(action_input_json_str: str) -> str:
 
314
  try: result = WHISPER_MODEL.transcribe(path, fp16=False); return result["text"][:40000] # type: ignore
315
  except Exception as e: logger.error(f"Whisper error on '{path}': {e}", exc_info=True); return f"Error transcribing '{path}': {e}"
316
 
 
317
  DIRECT_MULTIMODAL_GEMINI_TOOL_DESC = (
318
  "Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-2.0-flash-exp) "
319
+ "for tasks like image description, Q&A about the image, or text generation based on the image. "
320
+ "Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction.\", \"task_id\": \"TASK_ID\" (optional)}'. "
321
  "Returns the model's text response."
322
  )
323
  @lc_tool_decorator(description=DIRECT_MULTIMODAL_GEMINI_TOOL_DESC)
324
  def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
325
+ # ... (Implementation from previous response)
326
+ global google_genai_client
327
+ if not google_genai_client: return "Error: google-genai SDK client not initialized."
328
+ if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available."
 
 
329
  try:
330
  data = json.loads(action_input_json_str)
331
  file_identifier = data.get("file_identifier")
332
+ text_prompt = data.get("text_prompt", "Describe this image.")
333
+ task_id = data.get("task_id")
334
+ if not file_identifier: return "Error: 'file_identifier' for image missing."
335
+ logger.info(f"Direct Multimodal Tool: Image '{file_identifier}', Prompt '{text_prompt}'")
 
 
 
 
 
336
  local_image_path = _download_file(file_identifier, task_id)
337
+ if local_image_path.startswith("Error:"): return f"Error downloading for Direct MM Tool: {local_image_path}"
 
 
 
338
  try:
339
  pil_image = Image.open(local_image_path)
340
+ except Exception as e_img_open: return f"Error opening image {local_image_path}: {str(e_img_open)}"
 
 
 
 
 
341
  response = google_genai_client.models.generate_content(
342
+ model=GEMINI_FLASH_MULTIMODAL_MODEL_NAME, contents=[pil_image, text_prompt]
 
343
  )
344
+ logger.info(f"Direct Multimodal Tool: Response from {GEMINI_FLASH_MULTIMODAL_MODEL_NAME} received.")
345
+ return response.text[:40000]
346
+ except json.JSONDecodeError as e_json_mm: return f"Error parsing JSON for Direct MM Tool: {str(e_json_mm)}. Input: {action_input_json_str}"
347
+ except Exception as e_tool_mm:
348
+ logger.error(f"Error in direct_multimodal_gemini_tool: {e_tool_mm}", exc_info=True)
349
+ return f"Error executing Direct Multimodal Tool: {str(e_tool_mm)}"
350
+
351
+ # --- Agent Prompts ---
352
+ LANGGRAPH_PROMPT_TEMPLATE_STR = """You are a highly intelligent agent for the GAIA benchmark.
353
+ Your goal is to provide an EXACT MATCH final answer. No conversational text, explanations, or markdown unless explicitly part of the answer.
354
+ TOOLS:
355
+ You have access to the following tools. Use them if necessary.
356
+ {tools}
357
+ TOOL USAGE:
358
+ - To use a tool, your response must include a `tool_calls` attribute in the AIMessage. Each tool call should be a dictionary with "name", "args" (a dictionary of arguments), and "id".
359
+ - For file tools ('read_pdf_tool', 'ocr_image_tool', 'transcribe_audio_tool', 'direct_multimodal_gemini_tool'): `args` must contain 'file_identifier' (filename/URL) and 'task_id' (if GAIA file). For 'direct_multimodal_gemini_tool', also include 'text_prompt'.
360
+ - 'web_search': `args` is like '{{"query": "search query"}}'.
361
+ - 'python_repl': `args` is like '{{"command": "python code string"}}'. Use print() for output.
362
+ RESPONSE FORMAT:
363
+ Final AIMessage should contain ONLY the answer in 'content' and NO 'tool_calls'. If using tools, 'content' can be thought process, with 'tool_calls'.
364
+ Begin!
365
+ Current Task Details (including Task ID and any associated files):
366
+ {input}"""
367
+
368
+ REACT_PROMPT_TEMPLATE_STR = """You are a highly intelligent agent for the GAIA benchmark.
369
+ Goal: EXACT MATCH answer. No extra text/markdown.
370
+ Tools: {tools}
371
+ Process: Question -> Thought -> Action (ONE of [{tool_names}]) -> Action Input -> Observation -> Thought ... -> Final Answer: [exact answer]
372
+ Tool Inputs:
373
+ - web_search: Your search query string.
374
+ - python_repl: Python code string. Use print(). For Excel/CSV, use pandas: import pandas as pd; df = pd.read_excel('./Data/TASKID_filename.xlsx'); print(df.head())
375
+ - read_pdf_tool, ocr_image_tool, transcribe_audio_tool: JSON string like '{{"file_identifier": "FILENAME_OR_URL", "task_id": "CURRENT_TASK_ID_IF_FILENAME"}}'.
376
+ - direct_multimodal_gemini_tool: JSON string like '{{"file_identifier": "IMAGE_FILENAME_OR_URL", "text_prompt": "Your prompt for the image.", "task_id": "TASK_ID_IF_GAIA_FILENAME"}}'.
377
+ If tool fails or info missing, Final Answer: N/A. Do NOT use unlisted tools.
378
+ Begin!
379
+ {input}
380
+ Thought:{agent_scratchpad}"""
381
 
382
 
 
 
 
383
  # --- Agent Initialization and Response Logic ---
384
  def initialize_agent_and_tools(force_reinit=False):
385
+ global AGENT_INSTANCE, TOOLS, LLM_INSTANCE, LANGGRAPH_FLAVOR_AVAILABLE, LG_StateGraph, LG_ToolExecutor_Class, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class, LANGGRAPH_MEMORY_SAVER, google_genai_client
386
  if AGENT_INSTANCE and not force_reinit: logger.info("Agent already initialized."); return
387
  logger.info("Initializing agent and tools...")
388
  if not GOOGLE_API_KEY: raise ValueError("GOOGLE_API_KEY not set for LangChain LLM.")
389
 
390
+ # CORRECTED ChatGoogleGenerativeAI initialization
391
+ # The safety_settings should be a dictionary where keys are HarmCategory enums and values are HarmBlockThreshold enums.
392
+ # Or a list of dicts: [{"category": HarmCategory.XYZ, "threshold": HarmBlockThreshold.ABC}, ...]
393
+ # Let's use the dictionary format as it's cleaner and suggested by LangChain's type hints.
394
+ llm_safety_settings_map = {
395
+ HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
396
+ HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
397
+ HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
398
+ HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
399
+ }
400
+ # If the above still causes issues due to Pydantic's strict enum key handling,
401
+ # an alternative is to pass it as a list of SafetySetting objects from google.generativeai.types,
402
+ # but ChatGoogleGenerativeAI's Pydantic model might not directly accept that.
403
+ # The most robust way if direct enums as keys fail is to convert enums to their string values for the dict if Pydantic demands.
404
+ # However, LangChain *should* handle the direct enums.
405
+ # The error `Input should be 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 or 11 [type=enum, input_value=<HarmCategory...>]`
406
+ # suggests that for the *keys* of the safety_settings dict, it might be expecting the integer value of the HarmCategory enum.
407
+ # This is unusual. Let's try with the most standard documented way first (direct enums).
408
+ # If that fails, the next step would be to try string names for keys.
409
+
410
  try:
411
+ LLM_INSTANCE = ChatGoogleGenerativeAI(
412
+ model=GEMINI_MODEL_NAME,
413
+ google_api_key=GOOGLE_API_KEY,
414
+ temperature=0.0,
415
+ safety_settings=llm_safety_settings_map, # Pass the map
416
+ timeout=120, # Corrected: 'timeout' not 'request_timeout'
417
+ convert_system_message_to_human=True
418
+ )
419
  logger.info(f"LangChain LLM (Planner) initialized: {GEMINI_MODEL_NAME}")
420
+ except Exception as e:
421
+ logger.error(f"LangChain LLM init failed: {e}", exc_info=True)
422
+ raise
423
 
424
  TOOLS = []
425
  if PYPDF2_AVAILABLE: TOOLS.append(read_pdf_tool)
426
  if PIL_TESSERACT_AVAILABLE: TOOLS.append(ocr_image_tool)
427
  if WHISPER_AVAILABLE: TOOLS.append(transcribe_audio_tool)
428
+ if google_genai_client and PIL_TESSERACT_AVAILABLE: TOOLS.append(direct_multimodal_gemini_tool); logger.info("Added 'direct_multimodal_gemini_tool'.")
429
+ else: logger.warning("'direct_multimodal_gemini_tool' NOT added (client or PIL missing).")
 
 
 
 
 
 
430
  try: search_tool = DuckDuckGoSearchRun(name="web_search"); search_tool.description = "Web search. Input: query."; TOOLS.append(search_tool)
431
  except Exception as e: logger.warning(f"DuckDuckGoSearchRun init failed: {e}")
432
  try: python_repl = PythonREPLTool(name="python_repl"); python_repl.description = "Python REPL. print() for output."; TOOLS.append(python_repl)
433
  except Exception as e: logger.warning(f"PythonREPLTool init failed: {e}")
434
  logger.info(f"Final tools list for agent: {[t.name for t in TOOLS]}")
435
 
436
+ if LANGGRAPH_FLAVOR_AVAILABLE and all([LG_StateGraph, LG_ToolExecutor_Class, LG_END, LLM_INSTANCE, LG_ToolInvocation, add_messages]):
 
 
437
  if not LANGGRAPH_MEMORY_SAVER and MemorySaver_Class: LANGGRAPH_MEMORY_SAVER = MemorySaver_Class(); logger.info("LangGraph MemorySaver initialized.")
438
  try:
439
+ logger.info(f"Attempting LangGraph init (Tool Executor type: {LG_ToolExecutor_Class.__name__ if LG_ToolExecutor_Class else 'None'})")
440
  _TypedDict = getattr(__import__('typing_extensions'), 'TypedDict', dict)
441
+ class AgentState(_TypedDict): input: str; messages: Annotated[List[Any], add_messages]
442
 
443
+ prompt_content_lg_init = LANGGRAPH_PROMPT_TEMPLATE_STR.format(
444
+ tools="\n".join([f"- {t.name}: {t.description}" for t in TOOLS]), input="{input}"
 
445
  )
446
  def agent_node(state: AgentState):
447
+ current_input = state.get('input', '')
448
+ formatted_system_prompt = prompt_content_lg_init.replace("{input}", current_input)
449
+ messages_for_llm = [SystemMessage(content=formatted_system_prompt)] + state.get('messages', [])
450
+ bound_llm = LLM_INSTANCE.bind_tools(TOOLS) # type: ignore
451
+ response = bound_llm.invoke(messages_for_llm)
452
+ return {"messages": [response]}
453
+
454
+ if not LG_ToolExecutor_Class: raise ValueError("LG_ToolExecutor_Class is None for LangGraph.")
455
+ # Instantiate ToolNode if that's what was imported
456
+ tool_executor_instance_lg = LG_ToolExecutor_Class(tools=TOOLS)
457
+
458
+
459
+ def tool_node(state: AgentState): # This function uses the instantiated tool_executor_instance_lg
460
+ last_msg = state['messages'][-1] if state.get('messages') and isinstance(state['messages'][-1], AIMessage) else None
461
+ if not last_msg or not last_msg.tool_calls: return {"messages": []}
462
+ # ToolNode expects a list of ToolInvocations if invoked directly,
463
+ # or handles it if part of a graph that structures it.
464
+ # The LangGraph prebuilt react agent often passes the AIMessage directly to ToolNode.
465
+ # Let's assume the ToolNode can handle a list of tool calls from the AIMessage.
466
+ # If ToolNode expects a single ToolInvocation, this loop needs adjustment.
467
+ # However, the standard ToolNode takes the AIMessage and iterates internally.
468
+ # The issue might be if `tool_executor_instance_lg` is not directly callable or its `invoke` expects different input.
469
+ # For now, let's assume the standard pattern where ToolNode handles the AIMessage's tool_calls.
470
+ # A simpler way to use ToolNode is often just to pass it to add_node if it's a runnable.
471
+ # tool_executor_instance_lg.invoke(last_msg.tool_calls) might be what's needed if it takes raw calls
472
+
473
+ # The following is more aligned if tool_executor_instance_lg is the older ToolExecutor
474
+ # or if ToolNode is used within a manual iteration like this:
475
+ tool_results = []
476
+ for tc in last_msg.tool_calls:
477
+ name, args, tc_id = tc.get('name'), tc.get('args'), tc.get('id')
478
+ if not all([name, isinstance(args, dict), tc_id]):
479
+ # ... error handling ...
480
+ err_msg=f"Invalid tool_call: {tc}"; logger.error(err_msg)
481
+ tool_results.append(ToolMessage(f"Error: {err_msg}", tool_call_id=tc_id or "error_id", name=name or "error_tool"))
482
  continue
483
  try:
484
+ logger.info(f"LG Tool Invoking: '{name}' with {args} (ID: {tc_id})")
485
+ tool_invocation_obj_lg = LG_ToolInvocation(tool=name, tool_input=args) # type: ignore
486
+ # This invoke is on the tool_executor_instance_lg (which could be ToolNode or older ToolExecutor)
487
+ output_lg = tool_executor_instance_lg.invoke(tool_invocation_obj_lg) # This might be the issue if ToolNode expects direct runnable call
488
+ tool_results.append(ToolMessage(content=str(output_lg), tool_call_id=tc_id, name=name))
489
  except Exception as e_tool_node_lg:
490
+ logger.error(f"LG Tool Error ('{name}'): {e_tool_node_lg}", exc_info=True)
491
+ tool_results.append(ToolMessage(content=f"Error for tool {name}: {str(e_tool_node_lg)}", tool_call_id=tc_id, name=name))
492
+ return {"messages": tool_results}
493
+
494
 
495
  workflow_lg = LG_StateGraph(AgentState) # type: ignore
496
  workflow_lg.add_node("agent", agent_node)
497
+ # If LG_ToolExecutor_Class is ToolNode, tool_node_instance is runnable
498
+ # workflow_lg.add_node("tools", tool_executor_instance_lg) # Preferred way if ToolNode is a runnable
499
+ workflow_lg.add_node("tools", tool_node) # Keep custom tool_node for now
500
  workflow_lg.set_entry_point("agent")
501
  def should_continue_lg(state: AgentState): return "tools" if state['messages'][-1].tool_calls else LG_END
502
  workflow_lg.add_conditional_edges("agent", should_continue_lg, {"tools": "tools", LG_END: LG_END}) # type: ignore
 
525
  if not AGENT_INSTANCE: raise RuntimeError("CRITICAL: Agent initialization completely failed.")
526
  logger.info(f"Agent init finished. Active agent type: {type(AGENT_INSTANCE).__name__}")
527
 
528
+ # --- Agent Execution, Prompt Construction, and Evaluation Run ---
 
 
529
  def get_agent_response(prompt: str, task_id: Optional[str]=None, thread_id: Optional[str]=None) -> str:
530
+ # ... (Your original get_agent_response logic - unchanged) ...
531
  global AGENT_INSTANCE, LLM_INSTANCE
532
  thread_id_to_use = thread_id or (f"gaia_task_{task_id}" if task_id else hashlib.md5(prompt.encode()).hexdigest()[:8])
533
  if not AGENT_INSTANCE or not LLM_INSTANCE:
 
543
  logger.debug(f"Using LangGraph agent (Memory: {LANGGRAPH_MEMORY_SAVER is not None}) for thread: {thread_id_to_use}")
544
  initial_messages_lg_get = []
545
  input_for_lg_get = {"input": prompt, "messages": initial_messages_lg_get}
546
+ final_state_lg_get = AGENT_INSTANCE.invoke(input_for_lg_get, {"configurable": {"thread_id": thread_id_to_use}}) # type: ignore
547
  if not final_state_lg_get or 'messages' not in final_state_lg_get or not final_state_lg_get['messages']:
548
  logger.error("LangGraph: No final state/messages."); return "[ERROR] LangGraph: No final state/messages."
549
  for message_item_lg_get in reversed(final_state_lg_get['messages']):
 
562
  return f"[ERROR] Agent execution failed: {str(e_agent_run_get)[:150]}"
563
 
564
  def construct_prompt_for_agent(q: Dict[str,Any]) -> str:
565
+ # ... (Your original construct_prompt_for_agent logic - unchanged) ...
566
  tid,q_str=q.get("task_id","N/A"),q.get("question",""); files=q.get("files",[])
567
  files_info = ("\nFiles:\n"+"\n".join([f"- {f} (task_id:{tid})"for f in files])) if files else ""
568
  level = f"\nLevel:{q.get('level')}" if q.get('level') else ""
 
573
  global AGENT_INSTANCE
574
  space_id = os.getenv("SPACE_ID")
575
  username_for_submission = None
 
576
  if profile and hasattr(profile, 'username') and profile.username:
577
  username_for_submission = profile.username
578
  logger.info(f"Username from OAuth profile: {username_for_submission}")
579
  else:
580
  logger.warning("OAuth profile not available or username missing.")
581
  return "Hugging Face login required. Please use the login button and try again.", None
 
582
  if AGENT_INSTANCE is None:
583
  try: logger.info("Agent not pre-initialized. Initializing for run..."); initialize_agent_and_tools()
584
  except Exception as e: return f"Agent on-demand initialization failed: {e}", None
585
  if AGENT_INSTANCE is None: return "Agent is still None after on-demand init.", None
 
586
  agent_code_url_run=f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local_dev_run"
587
  questions_url_run,submit_url_run=f"{DEFAULT_API_URL}/questions",f"{DEFAULT_API_URL}/submit"
588
  auth_headers_run={"Authorization":f"Bearer {HUGGINGFACE_TOKEN}"} if HUGGINGFACE_TOKEN else {}
 
592
  if not questions_data_run or not isinstance(questions_data_run,list):logger.error(f"Invalid questions data: {questions_data_run}");return "Fetched questions_data invalid.",None
593
  logger.info(f"Fetched {len(questions_data_run)} questions.")
594
  except Exception as e:logger.error(f"Fetch questions error: {e}",exc_info=True);return f"Fetch questions error:{e}",None
 
595
  results_log_run,answers_payload_run=[],[]
596
  logger.info(f"Running agent on {len(questions_data_run)} questions for user '{username_for_submission}'...")
597
  for i,item_run in enumerate(questions_data_run):
 
607
  logger.error(f"Agent error task {task_id_run}:{e}",exc_info=True);error_answer_run=f"AGENT ERROR:{str(e)[:100]}"
608
  answers_payload_run.append({"task_id":task_id_run,"submitted_answer":"N/A [AGENT_ERROR]"})
609
  results_log_run.append({"Task ID":task_id_run,"Question":question_text_run,"Full Agent Prompt":prompt_run,"Raw Agent Output":error_answer_run,"Submitted Answer":"N/A [AGENT_ERROR]"})
 
610
  if not answers_payload_run:return "Agent produced no answers.",pd.DataFrame(results_log_run)
611
  submission_payload_run={"username":username_for_submission.strip(),"agent_code":agent_code_url_run,"answers":answers_payload_run}
612
  logger.info(f"Submitting {len(answers_payload_run)} answers to {submit_url_run} for user '{username_for_submission}'...")
 
619
  error_http_run=f"HTTP {e.response.status_code}. Detail:{e.response.text[:200]}"; logger.error(f"Submit Fail:{error_http_run}",exc_info=True); return f"Submit Fail:{error_http_run}",pd.DataFrame(results_log_run)
620
  except Exception as e:logger.error(f"Submit Fail unexpected:{e}",exc_info=True);return f"Submit Fail:{str(e)[:100]}",pd.DataFrame(results_log_run)
621
 
622
+ # --- Build Gradio Interface ---
 
623
  with gr.Blocks(css=".gradio-container {max-width:1280px !important;margin:auto !important;}",theme=gr.themes.Soft()) as demo:
 
624
  gr.Markdown("# GAIA Agent Challenge Runner v7 (OAuth for Username)")
625
  gr.Markdown(f"""**Instructions:**
626
  1. **Login with Hugging Face** using the button below. Your HF username will be used for submission.
 
635
  gr.LoginButton()
636
  run_button = gr.Button("Run Evaluation & Submit All Answers")
637
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=7, interactive=False)
638
+ results_table = gr.DataFrame(label="Q&A Log", headers=["Task ID","Question","Prompt","Raw","Submitted"], wrap=True) # Removed height
639
 
640
  run_button.click(fn=run_and_submit_all, outputs=[status_output,results_table], api_name="run_evaluation")
641
 
642
  def update_ui_on_load_fn_within_context():
 
643
  global missing_vars_startup_list_global, agent_pre_init_status_msg_global
644
  secrets_msg_md = ""
645
  if missing_vars_startup_list_global:
 
652
  if env_issues: secrets_msg_md += f"<br/><font color='orange'>**Tool Deps Missing:** {', '.join(env_issues)}.</font>"
653
  current_status_md = agent_pre_init_status_msg_global
654
  if not LANGGRAPH_FLAVOR_AVAILABLE and "LangGraph" not in current_status_md:
655
+ current_status_md += f" (LangGraph core components not fully loaded: LG_ToolExecutor_Class is {type(LG_ToolExecutor_Class).__name__ if LG_ToolExecutor_Class else 'None'}, ReAct fallback.)"
656
+ elif LANGGRAPH_FLAVOR_AVAILABLE and "LangGraph" not in current_status_md:
657
+ current_status_md += f" (LangGraph ready with {type(LG_ToolExecutor_Class).__name__ if LG_ToolExecutor_Class else 'UnknownExecutor'}.)"
658
  return { agent_status_display: gr.Markdown(value=current_status_md),
659
  missing_secrets_display: gr.Markdown(value=secrets_msg_md) }
660
 
661
  demo.load(update_ui_on_load_fn_within_context, [], [agent_status_display, missing_secrets_display])
662
 
663
  if __name__ == "__main__":
664
+ logger.info(f"Application starting up (v7 with Pydantic & LangGraph fixes)...") # Updated version in log
 
665
  if not PYPDF2_AVAILABLE: logger.warning("PyPDF2 (PDF tool) NOT AVAILABLE.")
666
+ if not PIL_TESSERACT_AVAILABLE: logger.warning("Pillow/Pytesseract (OCR tool) NOT AVAILABLE.")
667
  if not WHISPER_AVAILABLE: logger.warning("Whisper (Audio tool) NOT AVAILABLE.")
668
+ if LANGGRAPH_FLAVOR_AVAILABLE: logger.info(f"Core LangGraph components (StateGraph, END, {type(LG_ToolExecutor_Class).__name__ if LG_ToolExecutor_Class else 'FailedExecutor'}) loaded.")
669
+ else: logger.warning("Core LangGraph FAILED import or essential component (ToolExecutor/Node) missing. ReAct fallback. Check requirements & Space build logs.")
670
 
671
  missing_vars_startup_list_global.clear()
672
  if not GOOGLE_API_KEY: missing_vars_startup_list_global.append("GOOGLE_API_KEY")
 
674
 
675
  try:
676
  logger.info("Pre-initializing agent...")
677
+ initialize_agent_and_tools()
678
  if AGENT_INSTANCE:
679
  agent_type_name = type(AGENT_INSTANCE).__name__
680
  agent_pre_init_status_msg_global = f"Agent Pre-initialized: **{agent_type_name}**."
681
+ if LANGGRAPH_FLAVOR_AVAILABLE and ("StateGraph" in agent_type_name or "CompiledGraph" in agent_type_name) :
682
+ lg_executor_display_name = type(LG_ToolExecutor_Class).__name__ if LG_ToolExecutor_Class else "UnknownExecutor"
683
+ agent_pre_init_status_msg_global = f"Agent Pre-initialized: **LangGraph** (Executor: {lg_executor_display_name}, Memory: {LANGGRAPH_MEMORY_SAVER is not None})."
684
  else: agent_pre_init_status_msg_global = "Agent pre-init FAILED (AGENT_INSTANCE is None)."
685
  logger.info(agent_pre_init_status_msg_global.replace("**",""))
686
  except Exception as e:
687
+ agent_pre_init_status_msg_global = f"Agent pre-init CRASHED: {str(e)[:100]}." # Show first 100 chars of error
688
+ logger.critical(f"Agent pre-init CRASHED: {e}", exc_info=True) # Full traceback to logs
689
 
690
  logger.info(f"Space ID: {os.getenv('SPACE_ID', 'Not Set')}")
691
  logger.info("Gradio Interface launching...")