|
import base64 |
|
import os |
|
|
|
from openai import OpenAI |
|
from smolagents import Tool |
|
|
|
# Shared OpenAI client used by DescribeImageTool.forward; created once at
# import time. NOTE(review): presumably picks up credentials from the
# environment -- confirm against the deployment configuration.
client = OpenAI()
|
|
|
|
|
class DescribeImageTool(Tool):
    """Tool that describes an image with an OpenAI vision-capable chat model.

    The image is read from disk, base64-encoded into a data URL and sent,
    together with a text prompt, to the chat completions endpoint through the
    module-level ``client``.

    Args:
        image_path (str): Path to the image file.
        description_type (str): Type of description to generate. Options:
            - "general": General description of the image
            - "detailed": Detailed analysis of the image
            - "chess": Analysis of a chess position
            - "text": Extract and describe text from the image
            - "custom": Custom description based on user prompt
            Unknown or missing values fall back to "general".
        custom_prompt (str): Prompt used when description_type is "custom".
            If it is missing or blank, the "general" prompt is used instead.

    Returns:
        str: Description of the image based on the requested type, or a
        message starting with "Error" when the request could not be completed.
    """

    name = "describe_image"
    description = "Analyzes and describes images using GPT-4 Vision API"
    inputs = {
        "image_path": {"type": "string", "description": "Path to the image file"},
        "description_type": {
            "type": "string",
            "description": "Type of description to generate (general, detailed, chess, text, custom)",
            "nullable": True,
        },
        "custom_prompt": {
            "type": "string",
            "description": "Custom prompt for description (only used when description_type is 'custom')",
            "nullable": True,
        },
    }
    output_type = "string"

    def encode_image(self, image_path: str) -> str:
        """Return the contents of the file at ``image_path`` as base64 text."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_prompt(self, description_type: str, custom_prompt: str = None) -> str:
        """Return the text prompt for the requested description type.

        Args:
            description_type: One of "general", "detailed", "chess", "text"
                or "custom". Any other value (including None) selects the
                "general" prompt.
            custom_prompt: Prompt text returned verbatim when
                ``description_type`` is "custom".

        Returns:
            The prompt string. Never None: a "custom" request without usable
            prompt text falls back to the "general" prompt.
        """
        prompts = {
            "general": "Provide a general description of this image. Focus on the main subjects, colors, and overall scene.",
            "detailed": (
                "Analyze this image in detail. Include:\n"
                "1. Main subjects and their relationships\n"
                "2. Colors, lighting, and composition\n"
                "3. Any text or symbols present\n"
                "4. Context or possible meaning\n"
                "5. Notable details or interesting elements"
            ),
            "chess": (
                "Analyze this chess position and provide a detailed description including:\n"
                "1. List of pieces on the board for both white and black\n"
                "2. Whose turn it is to move\n"
                "3. Basic evaluation of the position\n"
                "4. Any immediate tactical opportunities or threats\n"
                "5. Suggested next moves with brief explanations"
            ),
            "text": "Extract and describe any text present in this image. If there are multiple pieces of text, organize them clearly.",
        }

        # Only honour "custom" when there is real prompt text; otherwise the
        # request would carry a null/blank text part.
        if description_type == "custom" and custom_prompt and custom_prompt.strip():
            return custom_prompt

        # "custom" is not a key in the table, so a custom request without a
        # prompt lands on the same "general" fallback as unknown types.
        return prompts.get(description_type, prompts["general"])

    def forward(
        self,
        image_path: str,
        description_type: str = "general",
        custom_prompt: str = None,
    ) -> str:
        """Describe the image at ``image_path`` and return the model's answer.

        Args:
            image_path: Path to the image file on disk.
            description_type: Prompt selector passed to ``get_prompt``.
            custom_prompt: Prompt text used when ``description_type`` is
                "custom".

        Returns:
            The text content of the first completion choice, or an error
            message string if the file is missing or any step raises.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import section.
        import mimetypes

        try:
            # isfile (not exists) so that a directory path is reported as a
            # missing image instead of failing later inside open().
            if not os.path.isfile(image_path):
                return f"Error: Image file not found at {image_path}"

            # Label the data URL with the file's real image type; keep JPEG as
            # the fallback when the extension is unknown or not an image.
            mime_type, _ = mimetypes.guess_type(image_path)
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/jpeg"

            base64_image = self.encode_image(image_path)
            prompt = self.get_prompt(description_type, custom_prompt)

            response = client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                max_tokens=1000,
            )

            return response.choices[0].message.content

        except Exception as e:
            # Tool boundary: report failures as text so the calling agent
            # receives a string result rather than an exception.
            return f"Error analyzing image: {str(e)}"
|
|