GAIA_benchmark_agent

Sleeping

GAIA_benchmark_agent / src /tools /image_processing_tools.py

gabriel-melki

Modify package structure

860424e 3 months ago

2.04 kB

	import functools
	import os

	import requests
	import torch
	from PIL import Image
	from smolagents.tools import tool
	from transformers import BlipProcessor, BlipForQuestionAnswering

	# Base URL of the scoring service; _download_file fetches files that are
	# missing locally from f"{DEFAULT_API_URL}/files/<name>".
	DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

	def _download_file(file_name: str) -> None:
	    """Download ``file_name`` from the scoring API unless it already exists.

	    The remote resource is addressed by the base name of ``file_name`` with
	    its final extension removed, i.e. ``{DEFAULT_API_URL}/files/<stem>``.
	    Directory components of ``file_name`` are used only for the local
	    destination, never for the URL.

	    Args:
	        file_name: Local path where the file is expected / will be stored.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	            Nothing is written to disk in that case, so a later call can
	            retry instead of finding a corrupt "image" on disk.
	        requests.RequestException: On connection errors or timeout.
	    """
	    # Already present locally: nothing to do.
	    if os.path.isfile(file_name):
	        return

	    # Use the stem of the base name so that paths such as "data/<id>.png",
	    # "./<id>.png" or an extension-less "<id>" all resolve to "<id>".
	    stem = os.path.splitext(os.path.basename(file_name))[0]
	    url = f"{DEFAULT_API_URL}/files/{stem}"

	    # A timeout keeps the caller from blocking forever on a stalled server.
	    response = requests.get(url, timeout=30)
	    # Fail loudly rather than saving an error page as the downloaded file.
	    response.raise_for_status()

	    with open(file_name, "wb") as out_file:
	        out_file.write(response.content)

	@functools.lru_cache(maxsize=None)
	def _load_blip_vqa(device: str):
	    """Load the BLIP VQA processor and model once per device and reuse them."""
	    checkpoint = "Salesforce/blip-vqa-base"
	    processor = BlipProcessor.from_pretrained(checkpoint)
	    model = BlipForQuestionAnswering.from_pretrained(checkpoint)
	    return processor, model.to(device)


	@tool
	def ask_question_about_image(question: str, path_to_image: str) -> str:
	    """
	    Ask a question about an image and return the answer.
	    The image is downloaded first if it is not already present at the given path.

	    Args:
	        question: the question to ask about the image.
	        path_to_image: The path to the image to ask the question about.

	    Returns:
	        A string with the answer to the question.
	    """
	    # Download the file if it doesn't exist
	    _download_file(path_to_image)

	    # Use the GPU when CUDA is available, otherwise fall back to the CPU
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'

	    # Instantiating the checkpoint is expensive, so the processor/model pair
	    # is cached per device instead of being rebuilt on every tool call.
	    processor, model = _load_blip_vqa(device)

	    # convert() returns an independent in-memory copy, so the underlying file
	    # handle can be released as soon as the conversion is done.
	    with Image.open(path_to_image) as raw_image:
	        image = raw_image.convert('RGB')

	    # Build the model inputs and move every tensor to the model's device
	    inputs = processor(image, question, return_tensors="pt")
	    inputs = {k: v.to(device) for k, v in inputs.items()}

	    # Generate the answer; gradients are not needed for inference
	    with torch.no_grad():
	        outputs = model.generate(**inputs, max_length=50, num_beams=5)

	    # Decode the first generated sequence into plain text
	    return processor.decode(outputs[0], skip_special_tokens=True)