AlexTrinityBlock committed on
Commit
7a0b5ad
·
1 Parent(s): 01faebd

feat(tools): add ocr_reader and list_files tools to expand agent capabilities

Browse files

- Add ocr_reader tool to extract text from image and photo attachments
- Add list_files tool for directory listing functionality
- Update file_downloader with User-Agent headers to avoid request blocks
- Extend supervisor prompt with image handling and insufficient information response

agent/agent.py CHANGED
@@ -5,6 +5,8 @@ from langchain.agents import create_agent
5
  from langchain_core.messages import HumanMessage
6
  from agent.tools.math_solver import math_solver
7
  from agent.tools.file_downloader import file_downloader
 
 
8
 
9
  from agent.agents.websearchagents import web_search_agents
10
 
@@ -19,18 +21,23 @@ def supervisor_agent():
19
  return create_agent(
20
  model="google_genai:gemini-3-flash-preview",
21
  # tools=[math_solver, websearch_agent, web_search_agents],
22
- tools=[math_solver, web_search_agents, file_downloader],
23
  system_prompt=(
24
  f"You are a supervisor agent. "
25
  f"Current time is: {datetime.now(timezone.utc).isoformat()}. "
26
  f"Your memory are out of date. "
27
  f"For any math or calculation questions, use the math_solver tool for check, "
28
- f"the accurate is the most important."
29
  f"All questions that need real-time, must use the web_search_agents tool "
30
  f"to get a concise and accurate final answer. "
 
 
31
  f"Once you have found the answer, respond immediately. "
32
  f"Do NOT continue searching or verifying unnecessarily — "
33
- f"you have a limited number of action steps and must avoid exceeding them."
 
 
 
34
  ),
35
  )
36
 
@@ -88,17 +95,18 @@ def run(query: str, file_url: str | None = None, max_retries: int = 3) -> str:
88
 
89
 
90
  if __name__ == "__main__":
91
- run(input("Query:"))
92
- # agent = supervisor_agent()
93
- # chat_history: list = []
94
- # while True:
95
- # query = input("\nYou: ")
96
- # if query.lower() in ("exit", "quit"):
97
- # break
98
- # chat_history.append(HumanMessage(content=query))
99
- # result = agent.invoke({"messages": chat_history})
100
- # chat_history = result["messages"]
101
- # content = chat_history[-1].content
102
- # if isinstance(content, list):
103
- # content = content[0].get("text", "")
104
- # print(f"Agent: {content}")
 
 
5
  from langchain_core.messages import HumanMessage
6
  from agent.tools.math_solver import math_solver
7
  from agent.tools.file_downloader import file_downloader
8
+ from agent.tools.ocr_reader import ocr_reader
9
+ from agent.tools.list_files import list_files
10
 
11
  from agent.agents.websearchagents import web_search_agents
12
 
 
21
  return create_agent(
22
  model="google_genai:gemini-3-flash-preview",
23
  # tools=[math_solver, websearch_agent, web_search_agents],
24
+ tools=[math_solver, web_search_agents, file_downloader, ocr_reader, list_files],
25
  system_prompt=(
26
  f"You are a supervisor agent. "
27
  f"Current time is: {datetime.now(timezone.utc).isoformat()}. "
28
  f"Your memory are out of date. "
29
  f"For any math or calculation questions, use the math_solver tool for check, "
30
+ f"the accurate is the most important. "
31
  f"All questions that need real-time, must use the web_search_agents tool "
32
  f"to get a concise and accurate final answer. "
33
+ f"If an image or photo file is attached, download and use the ocr_reader tool "
34
+ f"to extract and describe the content before answering. "
35
  f"Once you have found the answer, respond immediately. "
36
  f"Do NOT continue searching or verifying unnecessarily — "
37
+ f"you have a limited number of action steps and must avoid exceeding them. "
38
+ f"If you do not have enough information to answer the question "
39
+ f"and no tool can help, respond with: "
40
+ f"'Insufficient information to provide an answer.'"
41
  ),
42
  )
43
 
 
95
 
96
 
97
if __name__ == "__main__":
    # run(input("Query:"))  # single-shot entry point (disabled)
    # Interactive REPL: keep the whole conversation in memory so the agent
    # sees prior turns as context on every invocation.
    chat_agent = supervisor_agent()
    history: list = []
    while True:
        user_input = input("\nYou: ")
        if user_input.lower() in ("exit", "quit"):
            break
        history.append(HumanMessage(content=user_input))
        outcome = chat_agent.invoke({"messages": history})
        # The agent returns the full updated message list; carry it forward.
        history = outcome["messages"]
        reply = history[-1].content
        # Some models return a list of content blocks; take the first text part.
        if isinstance(reply, list):
            reply = reply[0].get("text", "")
        print(f"Agent: {reply}")
agent/tools/file_downloader.py CHANGED
@@ -22,7 +22,10 @@ def file_downloader(url: str) -> str:
22
  filename = url.rstrip("/").split("/")[-1].split("?")[0] or "downloaded_file"
23
  dest = WORKSPACE_DIR / filename
24
 
25
- with httpx.stream("GET", url, follow_redirects=True, timeout=60) as r:
 
 
 
26
  r.raise_for_status()
27
  with open(dest, "wb") as f:
28
  for chunk in r.iter_bytes():
 
22
  filename = url.rstrip("/").split("/")[-1].split("?")[0] or "downloaded_file"
23
  dest = WORKSPACE_DIR / filename
24
 
25
+ headers = {
26
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
27
+ }
28
+ with httpx.stream("GET", url, headers=headers, follow_redirects=True, timeout=60) as r:
29
  r.raise_for_status()
30
  with open(dest, "wb") as f:
31
  for chunk in r.iter_bytes():
agent/tools/list_files.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from colorama import Fore, Style # type: ignore[import]
4
+ from langchain_core.tools import tool
5
+
6
+ WORKSPACE_DIR = Path(__file__).resolve().parents[2] / "workspace"
7
+
8
+
9
@tool
def list_files() -> str:
    """Enumerate the files currently stored in the workspace directory.

    Returns:
        The workspace filenames sorted and joined with newlines, or an
        explanatory message when the directory is missing or empty.
    """
    if not WORKSPACE_DIR.exists():
        return f"Workspace directory does not exist: {WORKSPACE_DIR}"

    names = sorted(entry.name for entry in WORKSPACE_DIR.iterdir() if entry.is_file())

    if not names:
        return "Workspace directory is empty."

    # Console breadcrumb for the developer watching the agent run.
    print(f"{Fore.BLUE}[Workspace] {len(names)} file(s) found{Style.RESET_ALL}")
    return "\n".join(names)
agent/tools/ocr_reader.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from pathlib import Path
3
+
4
+ from colorama import Fore, Style # type: ignore[import]
5
+ from langchain_core.messages import HumanMessage
6
+ from langchain_core.tools import tool
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+
9
# Workspace directory where downloaded files are stored
WORKSPACE_DIR = Path(__file__).resolve().parents[2] / "workspace"

# Supported image extensions
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff"}

# Standing instructions sent to the vision model alongside every image:
# free-form images get a detailed left-to-right description, while structured
# content (tables, documents, chessboards) is reproduced as a Markdown table.
SYSTEM_PROMPT = (
    "You are a precise OCR and image analysis assistant. "
    "If the image is a photo or illustration, describe the content in detail "
    "from left to right, top to bottom. "
    "If the image contains a table, document, chessboard, or any structured data, "
    "reproduce the full structure as a Markdown table. "
    "Always be thorough and accurate."
)
+ )
23
+
24
+
25
+ def _encode_image(image_path: str) -> str:
26
+ """Read and base64-encode an image file."""
27
+ with open(image_path, "rb") as f:
28
+ return base64.b64encode(f.read()).decode("utf-8")
29
+
30
+
31
+ def _guess_mime_type(path: str) -> str:
32
+ """Return the MIME type based on file extension."""
33
+ ext = Path(path).suffix.lower()
34
+ mime_map = {
35
+ ".jpg": "image/jpeg",
36
+ ".jpeg": "image/jpeg",
37
+ ".png": "image/png",
38
+ ".gif": "image/gif",
39
+ ".bmp": "image/bmp",
40
+ ".webp": "image/webp",
41
+ ".tiff": "image/tiff",
42
+ }
43
+ return mime_map.get(ext, "image/jpeg")
44
+
45
+
46
@tool
def ocr_reader(filename: str, query: str = "") -> str:
    """Read and extract content from an image file in the workspace directory.

    The image must be located in the workspace directory (where file_downloader
    saves files). Only provide the filename, not the full path.

    For photos or illustrations, the content is described from left to right,
    top to bottom in detail. For tables, documents, or chessboards, the full
    structure is returned as a Markdown table.

    Args:
        filename: The filename of the image in the workspace directory.
            Example: "photo.jpg", "table.png"
        query: An optional question to ask about the image.
            If provided, the model will focus on answering this question.
            If omitted, the model performs general OCR / description.

    Returns:
        A string containing the extracted or described content of the image,
        or an "Error: ..." message string on any failure (missing file,
        unsupported format, or API error) — errors are returned, not raised,
        so the supervising agent can react to them.
    """
    path = WORKSPACE_DIR / filename
    if not path.exists():
        return f"Error: file not found: {path}"

    if path.suffix.lower() not in IMAGE_EXTENSIONS:
        return f"Error: unsupported image format '{path.suffix}'. Supported: {IMAGE_EXTENSIONS}"

    try:
        # Fix: log the actual filename being read (previously printed a
        # literal placeholder instead of the file name).
        print(
            f"{Fore.YELLOW}[OCR] Reading image: {filename}"
            f"{f' | Query: {query}' if query else ''}{Style.RESET_ALL}"
        )

        base64_image = _encode_image(str(path))
        mime_type = _guess_mime_type(str(path))

        llm = ChatGoogleGenerativeAI(model="gemini-3-flash-preview")

        # Append the user's question (if any) after the standing OCR instructions.
        prompt = f"{SYSTEM_PROMPT}\n\nUser question: {query}" if query else SYSTEM_PROMPT

        # Inline the image as a base64 data URL so no file upload is needed.
        message = HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
                },
            ]
        )

        response = llm.invoke([message])
        content = response.content
        # Some models return a list of content blocks; join their text parts.
        if isinstance(content, list):
            result = "\n".join(
                block.get("text", "") for block in content if isinstance(block, dict)
            )
        else:
            result = str(content)
        print(f"{Fore.YELLOW}[OCR] Result:\n{result}{Style.RESET_ALL}")
        return result
    except Exception as e:
        # Surface failures (I/O, API, auth) to the caller as a tool result.
        return f"Error processing image: {e}"