"""
Visual QA Tool - A tool for answering questions about images
This module provides functionality to analyze images and answer questions about them.
It leverages powerful vision-language models (VLMs) to understand image content and
respond to natural language questions about the images.
The module offers two implementations:
1. VisualQATool class - Uses Hugging Face's IDEFICS-2 model
2. visualizer function - Uses OpenAI's GPT-4o model with vision capabilities
Both implementations handle image loading, processing, and API communication to
provide detailed responses about image content.
Environment variables required:
- OPENAI_API_KEY: API key for OpenAI (for the visualizer function)
"""
import base64
import json
import mimetypes
import os
import uuid
from io import BytesIO
import PIL.Image
import requests
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from smolagents import Tool, tool
# Load environment variables from .env file
load_dotenv(override=True)
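# The OpenAI key can be supplied via a .env file in the working directory,
# e.g. (placeholder value):
#   OPENAI_API_KEY=sk-...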
def process_images_and_text(image_path, query, client):
"""
Process images and text using the IDEFICS-2 model from Hugging Face.
This function handles the formatting of prompts and images for the IDEFICS-2 model,
which is a powerful vision-language model capable of understanding images and text.
Args:
image_path (str): Path to the image file to analyze
query (str): The question or instruction about the image
client (InferenceClient): Hugging Face inference client for the model
Returns:
str: The model's response to the query about the image
"""
from transformers import AutoProcessor
# Format messages for the chat template
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": query},
],
},
]
# Load the processor for the IDEFICS-2 model
idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)
# Define a nested function to encode local images
def encode_local_image(image_path):
"""
Encode a local image file to a base64 string for API transmission.
Args:
image_path (str): Path to the local image file
Returns:
str: Base64-encoded image with proper formatting for the API
"""
# Load image and convert to RGB format
image = PIL.Image.open(image_path).convert("RGB")
# Convert the image to a base64 string
buffer = BytesIO()
image.save(buffer, format="JPEG") # Use the appropriate format (e.g., JPEG, PNG)
base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
# Add string formatting required by the endpoint
image_string = f"data:image/jpeg;base64,{base64_image}"
return image_string
# Encode the image and insert it into the prompt template
image_string = encode_local_image(image_path)
prompt_with_images = prompt_with_template.replace("<image>", "![]({}) ").format(image_string)
# Prepare the payload for the API request
payload = {
"inputs": prompt_with_images,
"parameters": {
"return_full_text": False,
"max_new_tokens": 200, # Limit response length
},
}
    # Send the request and parse the response; the text-generation endpoint
    # returns a list such as [{"generated_text": "..."}], so extract the text
    return json.loads(client.post(json=payload).decode())[0]["generated_text"]
# Function to encode images for API transmission
def encode_image(image_path):
"""
Encode an image for API transmission, handling both URLs and local files.
If the image_path is a URL, the function will download the image first.
Args:
image_path (str): Path or URL to the image
Returns:
str: Base64-encoded image string
"""
# Handle URL-based images by downloading them first
if image_path.startswith("http"):
# Set up a user agent to avoid being blocked by websites
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
request_kwargs = {
"headers": {"User-Agent": user_agent},
"stream": True, # Stream the download for large files
}
        # Send an HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()  # Raise an exception for HTTP errors
        content_type = response.headers.get("content-type", "")
        # Determine the file extension from the content type
        extension = mimetypes.guess_extension(content_type)
        if extension is None:
            extension = ".download"  # Default extension if unknown
        # Generate a unique filename and save the downloaded image,
        # creating the download directory if it does not exist yet
        os.makedirs("downloads", exist_ok=True)
        fname = str(uuid.uuid4()) + extension
        download_path = os.path.abspath(os.path.join("downloads", fname))
with open(download_path, "wb") as fh:
for chunk in response.iter_content(chunk_size=512):
fh.write(chunk)
# Update the image_path to the local downloaded file
image_path = download_path
# Encode the local image file to base64
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
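# Both call styles below return the same kind of base64 string; the first
# downloads the file into ./downloads/ before encoding it (the URL and the
# file name here are hypothetical):
#   encode_image("https://example.com/cat.png")
#   encode_image("cat.png")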
def resize_image(image_path):
"""
Resize an image to half its original dimensions.
This function is used when the original image is too large for the API.
Args:
image_path (str): Path to the image file
Returns:
str: Path to the resized image
"""
# Open and get dimensions of the image
img = PIL.Image.open(image_path)
width, height = img.size
# Resize to half the original dimensions
img = img.resize((int(width / 2), int(height / 2)))
    # Save under a new filename in the same directory as the original
    # (prefixing the whole path would break for paths with directories)
    directory, filename = os.path.split(image_path)
    new_image_path = os.path.join(directory, f"resized_{filename}")
    img.save(new_image_path)
    return new_image_path
class VisualQATool(Tool):
"""
A tool that can answer questions about images using the IDEFICS-2 model.
This class implements the Tool interface from smolagents and provides
functionality to analyze images and answer questions about them.
"""
name = "visualizer"
description = "A tool that can answer questions about attached images."
inputs = {
"image_path": {
"description": "The path to the image on which to answer the question",
"type": "string",
},
"question": {"description": "the question to answer", "type": "string", "nullable": True},
}
output_type = "string"
# Initialize the Hugging Face inference client for IDEFICS-2
client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
def forward(self, image_path: str, question: str | None = None) -> str:
"""
Process an image and answer a question about it.
If no question is provided, the function will generate a detailed caption.
Args:
image_path (str): Path to the image file
question (str, optional): Question to answer about the image
Returns:
str: Answer to the question or a caption for the image
"""
output = ""
add_note = False
# If no question is provided, default to generating a caption
if not question:
add_note = True
question = "Please write a detailed caption for this image."
        try:
            # Try to process the image and question
            output = process_images_and_text(image_path, question, self.client)
        except Exception as e:
            # If the payload was too large, resize the image and retry once;
            # otherwise propagate the error instead of silently returning ""
            if "Payload Too Large" in str(e):
                new_image_path = resize_image(image_path)
                output = process_images_and_text(new_image_path, question, self.client)
            else:
                raise
# Add a note if we generated a caption instead of answering a question
if add_note:
output = (
f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
)
return output
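# Usage sketch (illustrative): the class-based tool can be handed to a
# smolagents agent or called directly. Note that it registers under the same
# tool name ("visualizer") as the GPT-4o function below, so an agent should be
# given one or the other, not both.
#
#   from smolagents import CodeAgent
#   agent = CodeAgent(tools=[VisualQATool()], model=...)  # model setup omitted
#
#   answer = VisualQATool()("./photo.jpg", "How many people are visible?")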
@tool
def visualizer(image_path: str, question: str | None = None) -> str:
"""
A tool that can answer questions about attached images using OpenAI's GPT-4o model.
This function provides an alternative implementation using OpenAI's vision capabilities
instead of the Hugging Face model used in VisualQATool.
Args:
        image_path: The path to the image on which to answer the question. This should be a local path to a downloaded image.
question: The question to answer.
Returns:
str: Answer to the question or a caption for the image
"""
# If no question is provided, default to generating a caption
add_note = False
if not question:
add_note = True
question = "Please write a detailed caption for this image."
    # Validate input
    if not isinstance(image_path, str):
        raise TypeError("You should provide at least an `image_path` string argument to this tool!")
    # Determine the MIME type and encode the image; fall back to JPEG when the
    # type cannot be guessed (e.g. for files saved with a ".download" extension)
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None:
        mime_type = "image/jpeg"
    base64_image = encode_image(image_path)
# Prepare the payload for the OpenAI API request
payload = {
"model": "gpt-4o", # Using GPT-4o with vision capabilities
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": question},
{"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
],
}
],
"max_tokens": 1000, # Limit response length
}
# Set up headers with API key
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
# Send the request to the OpenAI API
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    # Parse the response, surfacing the raw body if the format is unexpected
    try:
        output = response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        raise Exception(f"Response format unexpected: {response.text}") from e
# Add a note if we generated a caption instead of answering a question
if add_note:
output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
return output
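# Minimal smoke test (illustrative): run this module directly to describe a
# hypothetical local image; requires OPENAI_API_KEY for the GPT-4o backend.
if __name__ == "__main__":
    print(visualizer("./example.jpg", "Describe this image in one sentence."))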