"""
Visual QA Tool - A tool for answering questions about images

This module provides functionality to analyze images and answer natural language
questions about them. It uses vision-language models (VLMs) to understand image
content and generate detailed responses.

The module offers two implementations:
1. VisualQATool class - Uses Hugging Face's IDEFICS-2 model
2. visualizer function - Uses OpenAI's GPT-4o model with vision capabilities

Both implementations handle image loading, processing, and API communication.

Environment variables:
- OPENAI_API_KEY: API key for OpenAI (required for the visualizer function)
- HF_TOKEN: Hugging Face token picked up by InferenceClient (optional, for the
  VisualQATool class)
"""

import base64
import json
import mimetypes
import os
import uuid
from io import BytesIO

import PIL.Image
import requests
from dotenv import load_dotenv
from huggingface_hub import InferenceClient

from smolagents import Tool, tool


# Load environment variables from .env file
load_dotenv(override=True)


def process_images_and_text(image_path, query, client):
    """
    Process images and text using the IDEFICS-2 model from Hugging Face.
    
    This function handles the formatting of prompts and images for the IDEFICS-2 model,
    which is a powerful vision-language model capable of understanding images and text.
    
    Args:
        image_path (str): Path to the image file to analyze
        query (str): The question or instruction about the image
        client (InferenceClient): Hugging Face inference client for the model
        
    Returns:
        str: The model's response to the query about the image
    """
    from transformers import AutoProcessor

    # Format messages for the chat template
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": query},
            ],
        },
    ]
    
    # Load the processor for the IDEFICS-2 model
    idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
    prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)

    # Define a nested function to encode local images
    def encode_local_image(image_path):
        """
        Encode a local image file to a base64 string for API transmission.
        
        Args:
            image_path (str): Path to the local image file
            
        Returns:
            str: Base64-encoded image with proper formatting for the API
        """
        # Load image and convert to RGB format
        image = PIL.Image.open(image_path).convert("RGB")

        # Convert the image to a base64 string
        buffer = BytesIO()
        image.save(buffer, format="JPEG")  # Save as JPEG to match the data URL's MIME type below
        base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # Add string formatting required by the endpoint
        image_string = f"data:image/jpeg;base64,{base64_image}"

        return image_string

    # Encode the image and insert it into the prompt template
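    # (str.replace leaves a "{}" placeholder inside a markdown image tag, which
    # str.format then fills with the base64 data URL)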
    image_string = encode_local_image(image_path)
    prompt_with_images = prompt_with_template.replace("<image>", "![]({}) ").format(image_string)

    # Prepare the payload for the API request
    payload = {
        "inputs": prompt_with_images,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 200,  # Limit response length
        },
    }

    # Send the request to the API and parse the response
    # (the text-generation endpoint returns a list like [{"generated_text": ...}])
    return json.loads(client.post(json=payload).decode())[0]["generated_text"]


# Function to encode images for API transmission
def encode_image(image_path):
    """
    Encode an image for API transmission, handling both URLs and local files.
    
    If the image_path is a URL, the function will download the image first.
    
    Args:
        image_path (str): Path or URL to the image
        
    Returns:
        str: Base64-encoded image string
    """
    # Handle URL-based images by downloading them first
    if image_path.startswith("http"):
        # Set up a user agent to avoid being blocked by websites
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,  # Stream the download for large files
        }

        # Send a HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()  # Raise an exception for HTTP errors
        content_type = response.headers.get("content-type", "")

        # Determine the file extension from the content type
        extension = mimetypes.guess_extension(content_type)
        if extension is None:
            extension = ".download"  # Default extension if unknown

        # Generate a unique filename and save the downloaded image
        fname = str(uuid.uuid4()) + extension
        os.makedirs("downloads", exist_ok=True)  # Create the downloads directory if it doesn't exist
        download_path = os.path.abspath(os.path.join("downloads", fname))

        with open(download_path, "wb") as fh:
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)

        # Update the image_path to the local downloaded file
        image_path = download_path

    # Encode the local image file to base64
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def resize_image(image_path):
    """
    Resize an image to half its original dimensions.
    
    This function is used when the original image is too large for the API.
    
    Args:
        image_path (str): Path to the image file
        
    Returns:
        str: Path to the resized image
    """
    # Open and get dimensions of the image
    img = PIL.Image.open(image_path)
    width, height = img.size
    
    # Resize to half the original dimensions
    img = img.resize((int(width / 2), int(height / 2)))
    
    # Save under a new filename next to the original (prefix only the basename
    # so paths that include directories still resolve correctly)
    new_image_path = os.path.join(os.path.dirname(image_path), f"resized_{os.path.basename(image_path)}")
    img.save(new_image_path)
    
    return new_image_path


class VisualQATool(Tool):
    """
    A tool that can answer questions about images using the IDEFICS-2 model.
    
    This class implements the Tool interface from smolagents and provides
    functionality to analyze images and answer questions about them.
    """
    name = "visualizer"
    description = "A tool that can answer questions about attached images."
    inputs = {
        "image_path": {
            "description": "The path to the image on which to answer the question",
            "type": "string",
        },
        "question": {"description": "the question to answer", "type": "string", "nullable": True},
    }
    output_type = "string"

    # Initialize the Hugging Face inference client for IDEFICS-2
    client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
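    # (the client resolves a Hugging Face token from the HF_TOKEN environment
    # variable or a cached `huggingface-cli login`, if one is available)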

    def forward(self, image_path: str, question: str | None = None) -> str:
        """
        Process an image and answer a question about it.
        
        If no question is provided, the function will generate a detailed caption.
        
        Args:
            image_path (str): Path to the image file
            question (str, optional): Question to answer about the image
            
        Returns:
            str: Answer to the question or a caption for the image
        """
        output = ""
        add_note = False
        
        # If no question is provided, default to generating a caption
        if not question:
            add_note = True
            question = "Please write a detailed caption for this image."
            
        try:
            # Try to process the image and question
            output = process_images_and_text(image_path, question, self.client)
        except Exception as e:
            # If the image is too large for the endpoint, resize it and retry once;
            # re-raise anything else rather than silently returning an empty string
            if "Payload Too Large" in str(e):
                new_image_path = resize_image(image_path)
                output = process_images_and_text(new_image_path, question, self.client)
            else:
                raise

        # Add a note if we generated a caption instead of answering a question
        if add_note:
            output = (
                f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
            )

        return output


@tool
def visualizer(image_path: str, question: str | None = None) -> str:
    """
    A tool that can answer questions about attached images using OpenAI's GPT-4o model.
    
    This function provides an alternative implementation using OpenAI's vision capabilities
    instead of the Hugging Face model used in VisualQATool.

    Args:
        image_path: The path to the image on which to answer the question. This should be a local path to a downloaded image.
        question: The question to answer.
        
    Returns:
        str: Answer to the question or a caption for the image
    """

    # If no question is provided, default to generating a caption
    add_note = False
    if not question:
        add_note = True
        question = "Please write a detailed caption for this image."
        
    # Validate input
    if not isinstance(image_path, str):
        raise TypeError("You must provide an `image_path` string argument to this tool!")

    # Determine the MIME type, falling back to JPEG when it cannot be guessed
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None:
        mime_type = "image/jpeg"
    base64_image = encode_image(image_path)

    # Prepare the payload for the OpenAI API request
    payload = {
        "model": "gpt-4o",  # Using GPT-4o with vision capabilities
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                ],
            }
        ],
        "max_tokens": 1000,  # Limit response length
    }
    
    # Set up headers with API key
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
    
    # Send the request to the OpenAI API
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    
    # Parse the response, raising a descriptive error if the shape is unexpected
    result = response.json()
    try:
        output = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError) as e:
        raise Exception(f"Response format unexpected: {result}") from e

    # Add a note if we generated a caption instead of answering a question
    if add_note:
        output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

    return output
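

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the library API): "sample.jpg"
    # is a placeholder path; point it at any local image before running. The
    # @tool decorator wraps `visualizer` as a smolagents Tool, which remains
    # callable like a plain function.
    sample_image = "sample.jpg"
    if os.path.exists(sample_image):
        print(visualizer(image_path=sample_image, question="What is shown in this image?"))
    else:
        print(f"Place an image at '{sample_image}' to run this demo.")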