|
import os |
|
import base64 |
|
import requests |
|
from smolagents import Tool |
|
|
|
class ImageAnalysisTool(Tool):
    """Tool that captions an image through a hosted Hugging Face model.

    The image file is read from disk, base64-encoded and posted as JSON to
    the endpoint stored in ``self.api_url``. The caption found in the reply
    is returned both as the caption and as the answer to the question; the
    question text is only echoed in the output and is not sent to the model.
    """

    name = "image_analysis"
    description = "Analyze the content of an image and answer a specific question about it using Hugging Face Inference API."
    inputs = {
        "image_path": {
            "type": "string",
            "description": "Path to the image file (jpg, png, etc.)"
        },
        "question": {
            "type": "string",
            "description": "A question about the image content"
        }
    }
    output_type = "string"

    def __init__(self):
        """Set up the endpoint URL and the authorization headers.

        Raises:
            EnvironmentError: If ``HF_API_TOKEN`` is unset or empty.
        """
        super().__init__()

        token = os.environ.get("HF_API_TOKEN")
        if not token:
            raise EnvironmentError("HF_API_TOKEN not found in environment variables.")

        self.api_url = "https://api-inference.huggingface.co/models/microsoft/git-base-captioning"
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

    def forward(self, image_path: str, question: str) -> str:
        """Caption the image at ``image_path`` and format the result.

        Args:
            image_path: Path of the image file to read and upload.
            question: Question to echo in the output; it is not sent to the
                model.

        Returns:
            ``"Caption: ...\\nAnswer to question '...': ..."`` on success,
            otherwise a string starting with ``"Error"`` that describes the
            failure. Exceptions raised while reading the file, calling the
            API or decoding the reply are reported this way, not propagated.
        """
        try:
            with open(image_path, "rb") as handle:
                raw = handle.read()
            encoded = base64.b64encode(raw).decode("utf-8")

            reply = requests.post(
                self.api_url,
                headers=self.headers,
                json={"inputs": encoded},
                timeout=60,
            )

            if reply.status_code != 200:
                return f"Error analyzing image: {reply.status_code} {reply.text}"

            body = reply.json()

            # The reply may be a single object or a list of objects; in the
            # list case only the first element is inspected.
            if isinstance(body, dict):
                record = body
            elif isinstance(body, list) and body and isinstance(body[0], dict):
                record = body[0]
            else:
                record = {}

            # Take the first non-empty value among the known caption keys.
            caption = None
            for key in ("generated_text", "caption", "text"):
                caption = record.get(key)
                if caption:
                    break

            if not caption:
                return "Error: No caption found in model response."

            # The caption doubles as the answer: the model receives only the
            # image, never the question.
            return f"Caption: {caption}\nAnswer to question '{question}': {caption}".strip()
        except Exception as exc:
            return f"Error analyzing image: {exc}"
|
|
|
|
|
|
|
|
|
|
|
|