Luigi D'Addona committed on
Commit
e0568e5
·
1 Parent(s): 93c3b2a

aggiunto tool analyze_png_image

Browse files
Files changed (2) hide show
  1. agent.py +4 -2
  2. tools.py +45 -0
agent.py CHANGED
@@ -14,7 +14,8 @@ from langchain_google_genai import ChatGoogleGenerativeAI
14
 
15
  # Local imports
16
  from tools import get_search_tool, get_tavily_search_tool, get_wikipedia_tool, wikipedia_search, wikipedia_search_3,\
17
- execute_python_code_from_file, download_taskid_file, analyze_excel_file, get_analyze_mp3_tool, arxiv_search
 
18
 
19
  # Nota: per i test in locale si usa il .env
20
  # su HuggingFace invece si usano le variabili definite in Settings/"Variables and secrets"
@@ -59,8 +60,9 @@ chat = ChatGoogleGenerativeAI(
59
  search_tool = get_tavily_search_tool()
60
  #wikipedia_tool = get_wikipedia_tool()
61
  analyze_mp3_tool = get_analyze_mp3_tool(chat)
 
62
 
63
- tools = [search_tool, wikipedia_search_3, execute_python_code_from_file, download_taskid_file, analyze_excel_file, analyze_mp3_tool, arxiv_search]
64
 
65
  # Bind tools to the model
66
  chat_with_tools = chat.bind_tools(tools)
 
14
 
15
  # Local imports
16
  from tools import get_search_tool, get_tavily_search_tool, get_wikipedia_tool, wikipedia_search, wikipedia_search_3,\
17
+ execute_python_code_from_file, download_taskid_file, analyze_excel_file, get_analyze_mp3_tool,\
18
+ get_analyze_image_tool, arxiv_search
19
 
20
  # Nota: per i test in locale si usa il .env
21
  # su HuggingFace invece si usano le variabili definite in Settings/"Variables and secrets"
 
60
  search_tool = get_tavily_search_tool()
61
  #wikipedia_tool = get_wikipedia_tool()
62
  analyze_mp3_tool = get_analyze_mp3_tool(chat)
63
+ analyze_png_tool = get_analyze_image_tool(chat)
64
 
65
+ tools = [search_tool, wikipedia_search_3, execute_python_code_from_file, download_taskid_file, analyze_excel_file, analyze_mp3_tool, analyze_png_tool, arxiv_search]
66
 
67
  # Bind tools to the model
68
  chat_with_tools = chat.bind_tools(tools)
tools.py CHANGED
@@ -256,6 +256,51 @@ def get_analyze_mp3_tool(llm):
256
  return analyze_mp3_file
257
 
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  @tool
260
  def arxiv_search(query: str) -> str:
261
  """Search Arxiv for a query and return maximum 3 result.
 
256
  return analyze_mp3_file
257
 
258
 
259
def get_analyze_image_tool(llm):
    """Build an image-description tool bound to the given chat model.

    Args:
        llm: Chat model exposing ``invoke``; it must accept multimodal
            (text + image) messages.

    Returns:
        The ``analyze_png_image`` tool, which closes over ``llm``.
    """
    # Local import so this block does not rely on a file-level import that
    # may not exist.
    import mimetypes

    @tool
    def analyze_png_image(image_path: str) -> str:
        """
        Analyzes a PNG image and returns a detailed description of its content.
        This tool requires an LLM capable of processing images, such as Gemini 1.5 Pro or Gemini 2.0 Flash.
        """
        # NOTE: the docstring above is used by @tool as the tool description
        # shown to the model, so it is runtime text and is kept verbatim.
        try:
            # Read image and encode as base64
            with open(image_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            # Label the payload with the real image type when the extension
            # reveals it; fall back to PNG (the original behaviour) otherwise.
            mime_type, _ = mimetypes.guess_type(image_path)
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/png"

            # Prepare the prompt including the base64 image data
            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Provide a very detailed description of the content of this image. "
                                "Focus on objects, people, actions, text, and overall scene context. "
                                "Be as comprehensive as possible."
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
                        },
                    ]
                )
            ]

            # Call the vision-capable model
            response = llm.invoke(message)

            content = response.content
            if isinstance(content, str):
                return content.strip()

            # Message content may also be a list of blocks (plain strings or
            # {"type": "text", "text": ...} dicts); keep only the text parts
            # instead of failing on ``list.strip``.
            text_parts = []
            for block in content:
                if isinstance(block, str):
                    text_parts.append(block)
                elif isinstance(block, dict) and block.get("type") == "text":
                    text_parts.append(block.get("text", ""))
            return "".join(text_parts).strip()
        except Exception as e:
            # Tool boundary: report the failure and return an empty result
            # rather than raising into the agent loop.
            print("Error analyzing image file:{} - {}".format(image_path, e))
            return ""

    return analyze_png_image
302
+
303
+
304
  @tool
305
  def arxiv_search(query: str) -> str:
306
  """Search Arxiv for a query and return maximum 3 result.