# Final_Assignment_Template / vision_tool.py
# (Hugging Face file-page residue: uploaded by ABVM, "Update vision_tool.py", commit 4985a13 verified)
# Vision tool using Groq's Meta-Llama Scout model
from smolagents import tool
from groq import Groq
import os
def _llama_analyze(image_b64: str, prompt: str) -> str:
    """Query a Groq-hosted vision model with a base64-encoded PNG image.

    Args:
        image_b64: Base64-encoded PNG data (without the ``data:`` URL prefix;
            it is added here).
        prompt: Text instruction sent alongside the image.

    Returns:
        The model's text response.

    Raises:
        Exception: Any Groq client/API error propagates to the caller
            (``image_reasoning_tool`` catches and reports these).
    """
    # Reads GROQ_API_KEY from the environment; the client raises on use if
    # the key is missing or invalid.
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                # Groq's OpenAI-compatible API accepts inline images as data URLs.
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ],
        }
    ]
    response = client.chat.completions.create(
        # Must be a vision-capable model: qwen/qwen-qwq-32b is text-only and
        # rejects image_url content parts, which made every vision call fail.
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=messages,
        stream=False,
        max_completion_tokens=512,
    )
    return response.choices[0].message.content
@tool
def image_reasoning_tool(image_file: str, prompt: str | None = None) -> dict:
    """Perform OCR and optional vision analysis on an image.
    This single entry point unifies OCR extraction and Llama vision reasoning so
    the planner only sees one image tool.
    Args:
        image_file: Path to the image file to analyze.
        prompt: Optional instruction for the vision model. If omitted, only OCR
            is performed.
    Returns:
        Dictionary with OCR text, base64 image data and optional vision model
        response.
    """
    try:
        # Heavy dependencies are imported lazily so the module itself can be
        # loaded even when they are absent; a failure falls into the outer
        # except and is reported in the returned dict.
        import pytesseract
        from PIL import Image
        from smolagents.utils import encode_image_base64

        img = Image.open(image_file)
        encoded = encode_image_base64(img)
        extracted = pytesseract.image_to_string(img)

        vision_reply = ""
        if prompt:
            try:
                vision_reply = _llama_analyze(encoded, prompt)
            except Exception as e:  # vision errors shouldn't break OCR result
                vision_reply = f"Error processing image with vision model: {e}"

        return {
            "ocr_text": extracted,
            "vision_text": vision_reply,
            "base64_image": encoded,
        }
    except Exception as e:
        # Best-effort contract: never raise into the agent loop — report the
        # failure in the payload with empty fields instead.
        return {
            "ocr_text": "",
            "vision_text": "",
            "base64_image": "",
            "error": str(e),
        }