ABVM committed on
Commit
a4dd17a
·
verified ·
1 Parent(s): fcddfb2

Delete vision_tool.py

Browse files
Files changed (1) hide show
  1. vision_tool.py +0 -70
vision_tool.py DELETED
@@ -1,70 +0,0 @@
1
- # Vision tool using Groq's Meta-Llama Scout model
2
- from smolagents import tool
3
- from groq import Groq
4
-
5
- import os
6
-
7
-
8
def _llama_analyze(
    image_b64: str,
    prompt: str,
    model: str = "meta-llama/llama-4-scout-17b-16e-instruct",
    max_tokens: int = 512,
) -> str:
    """Internal helper to query the Llama vision model.

    Sends one user message containing the text prompt and the image (as an
    inline ``data:`` URL) to the Groq chat-completions endpoint and returns
    the text of the first choice.

    Args:
        image_b64: Base64-encoded image data. It is embedded in a URL that
            declares ``image/png``; the caller is presumably expected to
            supply PNG bytes (not checked here).
        prompt: Instruction text sent alongside the image.
        model: Groq model identifier to query.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The reply text, or an empty string if the API returned no content.

    Raises:
        Exception: Whatever the Groq client raises (missing/invalid API key,
            network failure, API error) is propagated to the caller.
    """
    # The key is read from the environment on every call; if it is unset,
    # ``None`` is handed to the client, which decides how to fail.
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
            ],
        }
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        stream=False,
        max_tokens=max_tokens,
    )
    content = response.choices[0].message.content
    # ``content`` may be None; honour the declared ``-> str`` return type.
    return content if content is not None else ""
27
-
28
-
29
@tool
def image_reasoning_tool(image_file: str, prompt: str | None = None) -> dict:
    """Perform OCR and optional vision analysis on an image.

    This single entry point unifies OCR extraction and Llama vision reasoning so
    the planner only sees one image tool.

    Args:
        image_file: Path to the image file to analyze.
        prompt: Optional instruction for the vision model. If omitted, only OCR
            is performed.

    Returns:
        Dictionary with the keys ``ocr_text``, ``vision_text`` and
        ``base64_image``. ``vision_text`` is empty when no prompt was given and
        holds an error message when the vision call failed. If loading or OCR
        fails, the three keys are empty strings and an extra ``error`` key
        carries the exception text.
    """
    try:
        # Imported inside the try so that a missing optional dependency is
        # reported through the "error" key instead of raising.
        from PIL import Image
        from smolagents.utils import encode_image_base64
        import pytesseract

        # Context manager guarantees the underlying file handle is released,
        # even if encoding or OCR raises.
        with Image.open(image_file) as image:
            b64 = encode_image_base64(image)
            ocr_text = pytesseract.image_to_string(image)
    except Exception as e:
        return {
            "ocr_text": "",
            "vision_text": "",
            "base64_image": "",
            "error": str(e),
        }

    vision_text = ""
    if prompt:
        try:
            vision_text = _llama_analyze(b64, prompt)
        except Exception as e:  # vision errors shouldn't break OCR result
            vision_text = f"Error processing image with vision model: {e}"

    return {"ocr_text": ocr_text, "vision_text": vision_text, "base64_image": b64}
69
-
70
-