AlexTrinityBlock committed on
Commit
7a0b5ad
·
1 Parent(s): 01faebd

feat(tools): add ocr_reader and list_files tools to expand agent capabilities

Browse files

- Add ocr_reader tool to extract text from image and photo attachments
- Add list_files tool for directory listing functionality
- Update file_downloader with User-Agent headers to avoid request blocks
- Extend supervisor prompt with image handling and insufficient information response

agent/agent.py CHANGED
@@ -5,6 +5,8 @@ from langchain.agents import create_agent
5
  from langchain_core.messages import HumanMessage
6
  from agent.tools.math_solver import math_solver
7
  from agent.tools.file_downloader import file_downloader
 
 
8
 
9
  from agent.agents.websearchagents import web_search_agents
10
 
@@ -19,18 +21,23 @@ def supervisor_agent():
19
  return create_agent(
20
  model="google_genai:gemini-3-flash-preview",
21
  # tools=[math_solver, websearch_agent, web_search_agents],
22
- tools=[math_solver, web_search_agents, file_downloader],
23
  system_prompt=(
24
  f"You are a supervisor agent. "
25
  f"Current time is: {datetime.now(timezone.utc).isoformat()}. "
26
  f"Your memory are out of date. "
27
  f"For any math or calculation questions, use the math_solver tool for check, "
28
- f"the accurate is the most important."
29
  f"All questions that need real-time, must use the web_search_agents tool "
30
  f"to get a concise and accurate final answer. "
 
 
31
  f"Once you have found the answer, respond immediately. "
32
  f"Do NOT continue searching or verifying unnecessarily — "
33
- f"you have a limited number of action steps and must avoid exceeding them."
 
 
 
34
  ),
35
  )
36
 
@@ -88,17 +95,18 @@ def run(query: str, file_url: str | None = None, max_retries: int = 3) -> str:
88
 
89
 
90
  if __name__ == "__main__":
91
- run(input("Query:"))
92
- # agent = supervisor_agent()
93
- # chat_history: list = []
94
- # while True:
95
- # query = input("\nYou: ")
96
- # if query.lower() in ("exit", "quit"):
97
- # break
98
- # chat_history.append(HumanMessage(content=query))
99
- # result = agent.invoke({"messages": chat_history})
100
- # chat_history = result["messages"]
101
- # content = chat_history[-1].content
102
- # if isinstance(content, list):
103
- # content = content[0].get("text", "")
104
- # print(f"Agent: {content}")
 
 
5
  from langchain_core.messages import HumanMessage
6
  from agent.tools.math_solver import math_solver
7
  from agent.tools.file_downloader import file_downloader
8
+ from agent.tools.ocr_reader import ocr_reader
9
+ from agent.tools.list_files import list_files
10
 
11
  from agent.agents.websearchagents import web_search_agents
12
 
 
21
  return create_agent(
22
  model="google_genai:gemini-3-flash-preview",
23
  # tools=[math_solver, websearch_agent, web_search_agents],
24
+ tools=[math_solver, web_search_agents, file_downloader, ocr_reader, list_files],
25
  system_prompt=(
26
  f"You are a supervisor agent. "
27
  f"Current time is: {datetime.now(timezone.utc).isoformat()}. "
28
  f"Your memory are out of date. "
29
  f"For any math or calculation questions, use the math_solver tool for check, "
30
+ f"the accurate is the most important. "
31
  f"All questions that need real-time, must use the web_search_agents tool "
32
  f"to get a concise and accurate final answer. "
33
+ f"If an image or photo file is attached, download and use the ocr_reader tool "
34
+ f"to extract and describe the content before answering. "
35
  f"Once you have found the answer, respond immediately. "
36
  f"Do NOT continue searching or verifying unnecessarily — "
37
+ f"you have a limited number of action steps and must avoid exceeding them. "
38
+ f"If you do not have enough information to answer the question "
39
+ f"and no tool can help, respond with: "
40
+ f"'Insufficient information to provide an answer.'"
41
  ),
42
  )
43
 
 
95
 
96
 
97
if __name__ == "__main__":
    # run(input("Query:"))  # single-shot entry point (disabled)
    # Interactive REPL: keep the whole conversation in memory so the agent
    # sees prior turns as context on every invocation.
    chat_agent = supervisor_agent()
    history: list = []
    while True:
        user_input = input("\nYou: ")
        if user_input.lower() in ("exit", "quit"):
            break
        history.append(HumanMessage(content=user_input))
        outcome = chat_agent.invoke({"messages": history})
        # The agent returns the full updated message list; carry it forward.
        history = outcome["messages"]
        reply = history[-1].content
        # Some models return a list of content blocks; take the first text part.
        if isinstance(reply, list):
            reply = reply[0].get("text", "")
        print(f"Agent: {reply}")
agent/tools/file_downloader.py CHANGED
@@ -22,7 +22,10 @@ def file_downloader(url: str) -> str:
22
  filename = url.rstrip("/").split("/")[-1].split("?")[0] or "downloaded_file"
23
  dest = WORKSPACE_DIR / filename
24
 
25
- with httpx.stream("GET", url, follow_redirects=True, timeout=60) as r:
 
 
 
26
  r.raise_for_status()
27
  with open(dest, "wb") as f:
28
  for chunk in r.iter_bytes():
 
22
  filename = url.rstrip("/").split("/")[-1].split("?")[0] or "downloaded_file"
23
  dest = WORKSPACE_DIR / filename
24
 
25
+ headers = {
26
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
27
+ }
28
+ with httpx.stream("GET", url, headers=headers, follow_redirects=True, timeout=60) as r:
29
  r.raise_for_status()
30
  with open(dest, "wb") as f:
31
  for chunk in r.iter_bytes():
agent/tools/list_files.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from colorama import Fore, Style # type: ignore[import]
4
+ from langchain_core.tools import tool
5
+
6
+ WORKSPACE_DIR = Path(__file__).resolve().parents[2] / "workspace"
7
+
8
+
9
@tool
def list_files() -> str:
    """Enumerate the files currently stored in the workspace directory.

    Returns:
        The workspace filenames sorted and joined with newlines, or an
        explanatory message when the directory is missing or empty.
    """
    if not WORKSPACE_DIR.exists():
        return f"Workspace directory does not exist: {WORKSPACE_DIR}"

    names = sorted(entry.name for entry in WORKSPACE_DIR.iterdir() if entry.is_file())

    if not names:
        return "Workspace directory is empty."

    # Console breadcrumb for the developer watching the agent run.
    print(f"{Fore.BLUE}[Workspace] {len(names)} file(s) found{Style.RESET_ALL}")
    return "\n".join(names)
agent/tools/ocr_reader.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from pathlib import Path
3
+
4
+ from colorama import Fore, Style # type: ignore[import]
5
+ from langchain_core.messages import HumanMessage
6
+ from langchain_core.tools import tool
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+
9
# Workspace directory where downloaded files are stored
WORKSPACE_DIR = Path(__file__).resolve().parents[2] / "workspace"

# Supported image extensions
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff"}

# Standing instructions sent to the vision model alongside every image:
# free-form images get a detailed left-to-right description, while structured
# content (tables, documents, chessboards) is reproduced as a Markdown table.
SYSTEM_PROMPT = (
    "You are a precise OCR and image analysis assistant. "
    "If the image is a photo or illustration, describe the content in detail "
    "from left to right, top to bottom. "
    "If the image contains a table, document, chessboard, or any structured data, "
    "reproduce the full structure as a Markdown table. "
    "Always be thorough and accurate."
)
+ )
23
+
24
+
25
+ def _encode_image(image_path: str) -> str:
26
+ """Read and base64-encode an image file."""
27
+ with open(image_path, "rb") as f:
28
+ return base64.b64encode(f.read()).decode("utf-8")
29
+
30
+
31
+ def _guess_mime_type(path: str) -> str:
32
+ """Return the MIME type based on file extension."""
33
+ ext = Path(path).suffix.lower()
34
+ mime_map = {
35
+ ".jpg": "image/jpeg",
36
+ ".jpeg": "image/jpeg",
37
+ ".png": "image/png",
38
+ ".gif": "image/gif",
39
+ ".bmp": "image/bmp",
40
+ ".webp": "image/webp",
41
+ ".tiff": "image/tiff",
42
+ }
43
+ return mime_map.get(ext, "image/jpeg")
44
+
45
+
46
@tool
def ocr_reader(filename: str, query: str = "") -> str:
    """Read and extract content from an image file in the workspace directory.

    The image must be located in the workspace directory (where file_downloader
    saves files). Only provide the filename, not the full path.

    For photos or illustrations, the content is described from left to right,
    top to bottom in detail. For tables, documents, or chessboards, the full
    structure is returned as a Markdown table.

    Args:
        filename: The filename of the image in the workspace directory.
            Example: "photo.jpg", "table.png"
        query: An optional question to ask about the image.
            If provided, the model will focus on answering this question.
            If omitted, the model performs general OCR / description.

    Returns:
        A string containing the extracted or described content of the image,
        or an "Error: ..." message string on any failure (missing file,
        unsupported format, or API error) — errors are returned, not raised,
        so the supervising agent can react to them.
    """
    path = WORKSPACE_DIR / filename
    if not path.exists():
        return f"Error: file not found: {path}"

    if path.suffix.lower() not in IMAGE_EXTENSIONS:
        return f"Error: unsupported image format '{path.suffix}'. Supported: {IMAGE_EXTENSIONS}"

    try:
        # Fix: log the actual filename being read (previously printed a
        # literal placeholder instead of the file name).
        print(
            f"{Fore.YELLOW}[OCR] Reading image: {filename}"
            f"{f' | Query: {query}' if query else ''}{Style.RESET_ALL}"
        )

        base64_image = _encode_image(str(path))
        mime_type = _guess_mime_type(str(path))

        llm = ChatGoogleGenerativeAI(model="gemini-3-flash-preview")

        # Append the user's question (if any) after the standing OCR instructions.
        prompt = f"{SYSTEM_PROMPT}\n\nUser question: {query}" if query else SYSTEM_PROMPT

        # Inline the image as a base64 data URL so no file upload is needed.
        message = HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
                },
            ]
        )

        response = llm.invoke([message])
        content = response.content
        # Some models return a list of content blocks; join their text parts.
        if isinstance(content, list):
            result = "\n".join(
                block.get("text", "") for block in content if isinstance(block, dict)
            )
        else:
            result = str(content)
        print(f"{Fore.YELLOW}[OCR] Result:\n{result}{Style.RESET_ALL}")
        return result
    except Exception as e:
        # Surface failures (I/O, API, auth) to the caller as a tool result.
        return f"Error processing image: {e}"