"""
Visual QA Tool - A tool for answering questions about images
This module provides functionality to analyze images and answer questions about them.
It leverages powerful vision-language models (VLMs) to understand image content and
respond to natural language questions about the images.
The module offers two implementations:
1. VisualQATool class - Uses Hugging Face's IDEFICS-2 model
2. visualizer function - Uses OpenAI's GPT-4o model with vision capabilities
Both implementations handle image loading, processing, and API communication to
provide detailed responses about image content.
Environment variables required:
- OPENAI_API_KEY: API key for OpenAI (for the visualizer function)
"""
import base64
import json
import mimetypes
import os
import uuid
from io import BytesIO

import PIL.Image
import requests
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from smolagents import Tool, tool

# Load environment variables from .env file
load_dotenv(override=True)


def process_images_and_text(image_path, query, client):
    """
    Process images and text using the IDEFICS-2 model from Hugging Face.

    This function handles the formatting of prompts and images for the IDEFICS-2 model,
    which is a powerful vision-language model capable of understanding images and text.

    Args:
        image_path (str): Path to the image file to analyze
        query (str): The question or instruction about the image
        client (InferenceClient): Hugging Face inference client for the model

    Returns:
        str: The model's response to the query about the image
    """
    from transformers import AutoProcessor

    # Format messages for the chat template
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": query},
            ],
        },
    ]

    # Load the processor for the IDEFICS-2 model
    idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
    prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)

    # Define a nested function to encode local images
    def encode_local_image(image_path):
        """
        Encode a local image file to a base64 string for API transmission.

        Args:
            image_path (str): Path to the local image file

        Returns:
            str: Base64-encoded image with proper formatting for the API
        """
        # Load image and convert to RGB format
        image = PIL.Image.open(image_path).convert("RGB")

        # Convert the image to a base64 string
        buffer = BytesIO()
        image.save(buffer, format="JPEG")  # Use the appropriate format (e.g., JPEG, PNG)
        base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # Add string formatting required by the endpoint
        image_string = f"data:image/jpeg;base64,{base64_image}"
        return image_string

    # Encode the image and insert it into the prompt template
    image_string = encode_local_image(image_path)
    # Replace the processor's "<image>" placeholder with a format slot, then
    # inject the base64 data URL so the image travels inline with the prompt.
    # (Replacing the placeholder with a bare space would leave .format() with
    # nothing to fill, so the image would never reach the model.)
    prompt_with_images = prompt_with_template.replace("<image>", "![]({}) ").format(image_string)
    # Prepare the payload for the API request
    payload = {
        "inputs": prompt_with_images,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 200,  # Limit response length
        },
    }

    # Send the request to the API and parse the response
    return json.loads(client.post(json=payload).decode())[0]
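

# Illustrative sketch (not part of the original module): calling the helper
# above on its own. "photo.jpg" is a hypothetical local file, and the call
# performs a real inference request, so it is wrapped in a function rather
# than executed at import time.
def _demo_process_images_and_text():
    demo_client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
    print(process_images_and_text("photo.jpg", "What objects are visible?", demo_client))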


# Function to encode images for API transmission
def encode_image(image_path):
    """
    Encode an image for API transmission, handling both URLs and local files.

    If the image_path is a URL, the function will download the image first.

    Args:
        image_path (str): Path or URL to the image

    Returns:
        str: Base64-encoded image string
    """
    # Handle URL-based images by downloading them first
    if image_path.startswith("http"):
        # Set up a user agent to avoid being blocked by websites
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,  # Stream the download for large files
        }

        # Send an HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()  # Raise an exception for HTTP errors
        content_type = response.headers.get("content-type", "")

        # Determine the file extension from the content type
        extension = mimetypes.guess_extension(content_type)
        if extension is None:
            extension = ".download"  # Default extension if unknown

        # Generate a unique filename and save the downloaded image,
        # creating the downloads directory first if it does not exist
        os.makedirs("downloads", exist_ok=True)
        fname = str(uuid.uuid4()) + extension
        download_path = os.path.abspath(os.path.join("downloads", fname))
        with open(download_path, "wb") as fh:
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)

        # Update the image_path to the local downloaded file
        image_path = download_path

    # Encode the local image file to base64
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
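

# Illustrative sketch (an assumption about typical use, mirroring the
# `visualizer` function below): combining `encode_image` with a guessed MIME
# type to build the data URL that vision APIs expect. "photo.jpg" is a
# hypothetical local file.
def _demo_data_url():
    mime_type, _ = mimetypes.guess_type("photo.jpg")
    base64_image = encode_image("photo.jpg")
    return f"data:{mime_type};base64,{base64_image}"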


def resize_image(image_path):
    """
    Resize an image to half its original dimensions.

    This function is used when the original image is too large for the API.

    Args:
        image_path (str): Path to the image file

    Returns:
        str: Path to the resized image
    """
    # Open and get dimensions of the image
    img = PIL.Image.open(image_path)
    width, height = img.size

    # Resize to half the original dimensions
    img = img.resize((int(width / 2), int(height / 2)))

    # Save with a new filename, prefixing only the file name so that paths
    # with directory components remain valid
    directory, filename = os.path.split(image_path)
    new_image_path = os.path.join(directory, f"resized_{filename}")
    img.save(new_image_path)
    return new_image_path
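

# Illustrative sketch (not part of the original module): retrying with a
# halved image, as VisualQATool.forward does below when the payload is too
# large. "big_photo.jpg" is a hypothetical local file.
def _demo_resize_image():
    print(f"Resized copy written to: {resize_image('big_photo.jpg')}")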


class VisualQATool(Tool):
    """
    A tool that can answer questions about images using the IDEFICS-2 model.

    This class implements the Tool interface from smolagents and provides
    functionality to analyze images and answer questions about them.
    """

    name = "visualizer"
    description = "A tool that can answer questions about attached images."
    inputs = {
        "image_path": {
            "description": "The path to the image on which to answer the question",
            "type": "string",
        },
        "question": {"description": "the question to answer", "type": "string", "nullable": True},
    }
    output_type = "string"

    # Initialize the Hugging Face inference client for IDEFICS-2
    client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")

    def forward(self, image_path: str, question: str | None = None) -> str:
        """
        Process an image and answer a question about it.

        If no question is provided, the function will generate a detailed caption.

        Args:
            image_path (str): Path to the image file
            question (str, optional): Question to answer about the image

        Returns:
            str: Answer to the question or a caption for the image
        """
        output = ""
        add_note = False

        # If no question is provided, default to generating a caption
        if not question:
            add_note = True
            question = "Please write a detailed caption for this image."

        try:
            # Try to process the image and question
            output = process_images_and_text(image_path, question, self.client)
        except Exception as e:
            print(e)
            # If the image is too large, resize it and try again;
            # otherwise re-raise so the failure is not silently swallowed
            if "Payload Too Large" in str(e):
                new_image_path = resize_image(image_path)
                output = process_images_and_text(new_image_path, question, self.client)
            else:
                raise

        # Add a note if we generated a caption instead of answering a question
        if add_note:
            output = (
                f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
            )

        return output
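

# Illustrative sketch (not part of the original module): the Tool subclass
# above is normally handed to a smolagents agent, but it can also be invoked
# directly. "photo.jpg" is a hypothetical local file.
def _demo_visual_qa_tool():
    qa_tool = VisualQATool()
    print(qa_tool.forward("photo.jpg", question="What is the dominant color?"))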


@tool
def visualizer(image_path: str, question: str | None = None) -> str:
    """
    A tool that can answer questions about attached images using OpenAI's GPT-4o model.

    This function provides an alternative implementation using OpenAI's vision capabilities
    instead of the Hugging Face model used in VisualQATool.

    Args:
        image_path: The path to the image on which to answer the question. This should be a local path to a downloaded image.
        question: The question to answer.

    Returns:
        str: Answer to the question or a caption for the image
    """
    # If no question is provided, default to generating a caption
    add_note = False
    if not question:
        add_note = True
        question = "Please write a detailed caption for this image."

    # Validate input
    if not isinstance(image_path, str):
        raise Exception("You should provide at least `image_path` string argument to this tool!")

    # Determine the MIME type and encode the image
    mime_type, _ = mimetypes.guess_type(image_path)
    base64_image = encode_image(image_path)

    # Prepare the payload for the OpenAI API request
    payload = {
        "model": "gpt-4o",  # Using GPT-4o with vision capabilities
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                ],
            }
        ],
        "max_tokens": 1000,  # Limit response length
    }

    # Set up headers with API key
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}

    # Send the request to the OpenAI API
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    # Parse the response
    try:
        output = response.json()["choices"][0]["message"]["content"]
    except Exception:
        raise Exception(f"Response format unexpected: {response.json()}")

    # Add a note if we generated a caption instead of answering a question
    if add_note:
        output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

    return output
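

# Minimal usage sketch (an illustration, not part of the original module),
# assuming a hypothetical local file "photo.jpg" exists and OPENAI_API_KEY is
# set in the environment. The @tool decorator turns `visualizer` into a
# callable smolagents Tool instance.
if __name__ == "__main__":
    # With no question, the tool returns a detailed caption
    print(visualizer("photo.jpg"))
    # With a question, it answers about the image content
    print(visualizer("photo.jpg", question="How many people are visible?"))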
|