GAIA_benchmark_agent / src /tools /image_processing_tools.py
gabriel-melki
Modify package structure
860424e
import functools
import os

import requests
import torch
from PIL import Image
from smolagents.tools import tool
from transformers import BlipProcessor, BlipForQuestionAnswering
# Base URL of the scoring service; files are fetched from its ``/files/`` route.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Upper bound (seconds) on the download request so a stalled server cannot
# hang the caller indefinitely.
_DOWNLOAD_TIMEOUT_SECONDS = 30


def _remote_file_url(file_name: str) -> str:
    """Build the scoring-API URL that serves ``file_name``.

    The remote identifier is the base name of ``file_name`` with its final
    extension removed, so directory components (``./downloads/abc.png``) and
    extension-less names (``abc``) both resolve to ``.../files/abc``.
    NOTE(review): presumably the identifier is a task id -- confirm against
    the API.

    Args:
        file_name: Local path (or bare name) of the file to fetch.

    Returns:
        The absolute URL of the file on the scoring API.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_name))
    return f"{DEFAULT_API_URL}/files/{stem}"


def _download_file(file_name: str) -> None:
    """Download ``file_name`` from the scoring API unless it exists locally.

    Args:
        file_name: Local path where the file is expected; also used to derive
            the remote identifier (see ``_remote_file_url``).

    Raises:
        requests.HTTPError: If the server answers with an error status. Nothing
            is written to disk in that case.
        requests.RequestException: On connection problems or timeout.
    """
    if os.path.exists(file_name):
        return  # Already present locally; nothing to do.

    response = requests.get(
        _remote_file_url(file_name), timeout=_DOWNLOAD_TIMEOUT_SECONDS
    )
    # Fail before touching the disk: saving an error body under ``file_name``
    # would make later calls treat the bad file as already downloaded.
    response.raise_for_status()
    with open(file_name, "wb") as f:
        f.write(response.content)
@functools.lru_cache(maxsize=None)
def _load_blip_vqa(device: str) -> tuple:
    """Load the BLIP VQA processor and model once per device and cache them.

    Loading the checkpoint on every call is the dominant cost of answering a
    question, so the pair is memoized per device string.

    Args:
        device: Torch device string the model is moved to ('cuda' or 'cpu').

    Returns:
        A ``(processor, model)`` tuple, with the model already on ``device``.
    """
    checkpoint = "Salesforce/blip-vqa-base"
    processor = BlipProcessor.from_pretrained(checkpoint)
    model = BlipForQuestionAnswering.from_pretrained(checkpoint)
    return processor, model.to(device)


@tool
def ask_question_about_image(question: str, path_to_image: str) -> str:
    """
    Ask a question about an image and return the answer.

    Args:
        question: the question to ask about the image.
        path_to_image: The path to the image to ask the question about.

    Returns:
        A string with the answer to the question.
    """
    # Download the file if it doesn't exist
    _download_file(path_to_image)

    # Use the GPU when CUDA is available, otherwise fall back to the CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Reuse the cached processor/model instead of reloading them per call
    processor, model = _load_blip_vqa(device)

    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy has been made
    with Image.open(path_to_image) as raw_image:
        image = raw_image.convert('RGB')

    # Build the model inputs and move every tensor to the model's device
    inputs = processor(image, question, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate the answer without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=50, num_beams=5)

    # Decode and return the answer
    return processor.decode(outputs[0], skip_special_tokens=True)