Commit ·
7a0b5ad
1
Parent(s): 01faebd
feat(tools): add ocr_reader and list_files tools to expand agent capabilities
Browse files
- Add ocr_reader tool to extract text from image and photo attachments
- Add list_files tool for directory listing functionality
- Update file_downloader with User-Agent headers to avoid request blocks
- Extend supervisor prompt with image handling and insufficient information response
- agent/agent.py +25 -17
- agent/tools/file_downloader.py +4 -1
- agent/tools/list_files.py +26 -0
- agent/tools/ocr_reader.py +108 -0
agent/agent.py
CHANGED
|
@@ -5,6 +5,8 @@ from langchain.agents import create_agent
|
|
| 5 |
from langchain_core.messages import HumanMessage
|
| 6 |
from agent.tools.math_solver import math_solver
|
| 7 |
from agent.tools.file_downloader import file_downloader
|
|
|
|
|
|
|
| 8 |
|
| 9 |
from agent.agents.websearchagents import web_search_agents
|
| 10 |
|
|
@@ -19,18 +21,23 @@ def supervisor_agent():
|
|
| 19 |
return create_agent(
|
| 20 |
model="google_genai:gemini-3-flash-preview",
|
| 21 |
# tools=[math_solver, websearch_agent, web_search_agents],
|
| 22 |
-
tools=[math_solver, web_search_agents, file_downloader],
|
| 23 |
system_prompt=(
|
| 24 |
f"You are a supervisor agent. "
|
| 25 |
f"Current time is: {datetime.now(timezone.utc).isoformat()}. "
|
| 26 |
f"Your memory are out of date. "
|
| 27 |
f"For any math or calculation questions, use the math_solver tool for check, "
|
| 28 |
-
f"the accurate is the most important."
|
| 29 |
f"All questions that need real-time, must use the web_search_agents tool "
|
| 30 |
f"to get a concise and accurate final answer. "
|
|
|
|
|
|
|
| 31 |
f"Once you have found the answer, respond immediately. "
|
| 32 |
f"Do NOT continue searching or verifying unnecessarily — "
|
| 33 |
-
f"you have a limited number of action steps and must avoid exceeding them."
|
|
|
|
|
|
|
|
|
|
| 34 |
),
|
| 35 |
)
|
| 36 |
|
|
@@ -88,17 +95,18 @@ def run(query: str, file_url: str | None = None, max_retries: int = 3) -> str:
|
|
| 88 |
|
| 89 |
|
| 90 |
if __name__ == "__main__":
|
| 91 |
-
run(input("Query:"))
|
| 92 |
-
#
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
| 5 |
from langchain_core.messages import HumanMessage
|
| 6 |
from agent.tools.math_solver import math_solver
|
| 7 |
from agent.tools.file_downloader import file_downloader
|
| 8 |
+
from agent.tools.ocr_reader import ocr_reader
|
| 9 |
+
from agent.tools.list_files import list_files
|
| 10 |
|
| 11 |
from agent.agents.websearchagents import web_search_agents
|
| 12 |
|
|
|
|
| 21 |
return create_agent(
|
| 22 |
model="google_genai:gemini-3-flash-preview",
|
| 23 |
# tools=[math_solver, websearch_agent, web_search_agents],
|
| 24 |
+
tools=[math_solver, web_search_agents, file_downloader, ocr_reader, list_files],
|
| 25 |
system_prompt=(
|
| 26 |
f"You are a supervisor agent. "
|
| 27 |
f"Current time is: {datetime.now(timezone.utc).isoformat()}. "
|
| 28 |
f"Your memory are out of date. "
|
| 29 |
f"For any math or calculation questions, use the math_solver tool for check, "
|
| 30 |
+
f"the accurate is the most important. "
|
| 31 |
f"All questions that need real-time, must use the web_search_agents tool "
|
| 32 |
f"to get a concise and accurate final answer. "
|
| 33 |
+
f"If an image or photo file is attached, download and use the ocr_reader tool "
|
| 34 |
+
f"to extract and describe the content before answering. "
|
| 35 |
f"Once you have found the answer, respond immediately. "
|
| 36 |
f"Do NOT continue searching or verifying unnecessarily — "
|
| 37 |
+
f"you have a limited number of action steps and must avoid exceeding them. "
|
| 38 |
+
f"If you do not have enough information to answer the question "
|
| 39 |
+
f"and no tool can help, respond with: "
|
| 40 |
+
f"'Insufficient information to provide an answer.'"
|
| 41 |
),
|
| 42 |
)
|
| 43 |
|
|
|
|
| 95 |
|
| 96 |
|
| 97 |
if __name__ == "__main__":
    # Single-shot mode, kept for reference:
    # run(input("Query:"))

    # Interactive chat loop. The full message list returned by the agent is
    # fed back in on the next turn so the conversation keeps its context.
    agent = supervisor_agent()
    chat_history: list = []
    while True:
        try:
            query = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly instead of a traceback.
            print()
            break
        if not query:
            # Nothing typed: prompt again rather than sending an empty turn.
            continue
        if query.lower() in ("exit", "quit"):
            break
        chat_history.append(HumanMessage(content=query))
        result = agent.invoke({"messages": chat_history})
        chat_history = result["messages"]
        content = chat_history[-1].content
        if isinstance(content, list):
            # Message content can be a list of blocks (plain strings or
            # dicts). Collect every text part instead of assuming the first
            # block exists and is a dict with a "text" key.
            parts = [
                block if isinstance(block, str) else block.get("text", "")
                for block in content
                if isinstance(block, (str, dict))
            ]
            content = "\n".join(part for part in parts if part)
        print(f"Agent: {content}")
|
agent/tools/file_downloader.py
CHANGED
|
@@ -22,7 +22,10 @@ def file_downloader(url: str) -> str:
|
|
| 22 |
filename = url.rstrip("/").split("/")[-1].split("?")[0] or "downloaded_file"
|
| 23 |
dest = WORKSPACE_DIR / filename
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
| 26 |
r.raise_for_status()
|
| 27 |
with open(dest, "wb") as f:
|
| 28 |
for chunk in r.iter_bytes():
|
|
|
|
| 22 |
filename = url.rstrip("/").split("/")[-1].split("?")[0] or "downloaded_file"
|
| 23 |
dest = WORKSPACE_DIR / filename
|
| 24 |
|
| 25 |
+
headers = {
|
| 26 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
| 27 |
+
}
|
| 28 |
+
with httpx.stream("GET", url, headers=headers, follow_redirects=True, timeout=60) as r:
|
| 29 |
r.raise_for_status()
|
| 30 |
with open(dest, "wb") as f:
|
| 31 |
for chunk in r.iter_bytes():
|
agent/tools/list_files.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from colorama import Fore, Style # type: ignore[import]
|
| 4 |
+
from langchain_core.tools import tool
|
| 5 |
+
|
| 6 |
+
WORKSPACE_DIR = Path(__file__).resolve().parents[2] / "workspace"
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@tool
def list_files() -> str:
    """List all files in the workspace directory.

    Returns:
        A newline-separated list of filenames in the workspace directory,
        or a message indicating the directory is empty or does not exist.
    """
    # NOTE: with @tool the docstring above doubles as the tool description
    # shown to the model, so it is intentionally left unchanged.
    if WORKSPACE_DIR.exists():
        names = []
        for entry in WORKSPACE_DIR.iterdir():
            # Only entries that are files are reported; directories are skipped.
            if entry.is_file():
                names.append(entry.name)

        if names:
            print(f"{Fore.BLUE}[Workspace] {len(names)} file(s) found{Style.RESET_ALL}")
            names.sort()
            return "\n".join(names)

        return "Workspace directory is empty."

    return f"Workspace directory does not exist: {WORKSPACE_DIR}"
|
agent/tools/ocr_reader.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from colorama import Fore, Style # type: ignore[import]
|
| 5 |
+
from langchain_core.messages import HumanMessage
|
| 6 |
+
from langchain_core.tools import tool
|
| 7 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 8 |
+
|
| 9 |
+
# Shared workspace folder: the "workspace" directory located three levels
# above this file. file_downloader writes its downloads into the same place.
WORKSPACE_DIR = Path(__file__).resolve().parents[2].joinpath("workspace")

# File suffixes (lower-case, with the leading dot) accepted as images.
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff"}

# Instruction text sent to the vision model together with the image.
# The sentences are joined with single spaces into one paragraph.
SYSTEM_PROMPT = " ".join(
    (
        "You are a precise OCR and image analysis assistant.",
        "If the image is a photo or illustration, describe the content in detail"
        " from left to right, top to bottom.",
        "If the image contains a table, document, chessboard, or any structured data,"
        " reproduce the full structure as a Markdown table.",
        "Always be thorough and accurate.",
    )
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _encode_image(image_path: str) -> str:
    """Return the bytes of the file at *image_path* as base64 text.

    The file is read in one go in binary mode, and the base64 output is
    decoded to ``str`` so it can be embedded directly in a string.
    """
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _guess_mime_type(path: str) -> str:
    """Derive an image MIME type from the suffix of *path*.

    The suffix is compared case-insensitively. ``.jpg``, ``.jpeg`` and any
    suffix that is not recognised (including no suffix) give ``image/jpeg``.
    """
    suffix = Path(path).suffix.lower()
    if suffix in (".png", ".gif", ".bmp", ".webp", ".tiff"):
        # For these formats the subtype is the extension without its dot.
        return "image/" + suffix[1:]
    # Both JPEG spellings and every unknown suffix share the JPEG type.
    return "image/jpeg"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@tool
def ocr_reader(filename: str, query: str = "") -> str:
    """Read and extract content from an image file in the workspace directory.

    The image must be located in the workspace directory (where file_downloader
    saves files). Only provide the filename, not the full path.

    For photos or illustrations, the content is described from left to right,
    top to bottom in detail. For tables, documents, or chessboards, the full
    structure is returned as a Markdown table.

    Args:
        filename: The filename of the image in the workspace directory.
            Example: "photo.jpg", "table.png"
        query: An optional question to ask about the image.
            If provided, the model will focus on answering this question.
            If omitted, the model performs general OCR / description.

    Returns:
        A string containing the extracted or described content of the image.
    """
    # NOTE: with @tool the docstring above doubles as the tool description
    # shown to the model, so it is intentionally left unchanged.
    # Every failure is reported as an "Error: ..." string rather than raised,
    # so the calling agent can read the message and react to it.

    # `filename` is chosen by the model. Resolve it and verify the result is
    # still inside the workspace so that "..", an absolute path, or a symlink
    # cannot be used to read arbitrary files.
    workspace = WORKSPACE_DIR.resolve()
    path = (workspace / filename).resolve()
    try:
        path.relative_to(workspace)
    except ValueError:
        return f"Error: file is outside the workspace directory: {filename}"

    # is_file() also rejects directories (and an empty filename, which would
    # resolve to the workspace directory itself).
    if not path.is_file():
        return f"Error: file not found: {path}"

    if path.suffix.lower() not in IMAGE_EXTENSIONS:
        # Sort the extensions so the message is stable between runs.
        supported = ", ".join(sorted(IMAGE_EXTENSIONS))
        return f"Error: unsupported image format '{path.suffix}'. Supported: {supported}"

    try:
        print(
            f"{Fore.YELLOW}[OCR] Reading image: {filename}"
            f"{f' | Query: {query}' if query else ''}{Style.RESET_ALL}"
        )

        base64_image = _encode_image(str(path))
        mime_type = _guess_mime_type(str(path))

        llm = ChatGoogleGenerativeAI(model="gemini-3-flash-preview")

        # The instructions travel in the same human message as the image;
        # the caller's question, when given, is appended after them.
        prompt = f"{SYSTEM_PROMPT}\n\nUser question: {query}" if query else SYSTEM_PROMPT

        message = HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    # The image is passed inline as a base64 data URL.
                    "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
                },
            ]
        )

        response = llm.invoke([message])
        content = response.content
        if isinstance(content, list):
            # Content may arrive as a list of blocks; keep the text of each
            # dict block and join them with newlines.
            result = "\n".join(
                block.get("text", "") for block in content if isinstance(block, dict)
            )
        else:
            result = str(content)
        print(f"{Fore.YELLOW}[OCR] Result:\n{result}{Style.RESET_ALL}")
        return result
    except Exception as e:
        # Tool boundary: report read/model failures as text instead of
        # letting the exception propagate into the agent loop.
        return f"Error processing image: {e}"
|