# tools/multimodal_tools.py
import requests
import json
from typing import Optional, Dict, Any
from .utils import encode_image_to_base64, validate_file_exists, get_env_var, logger
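
# Note: the helpers imported from .utils are not shown in this file. Their
# assumed behavior (an assumption inferred from how they are used below, not a
# confirmed API) is roughly:
#   encode_image_to_base64(path) -> str   # base64-encoded file contents
#   validate_file_exists(path) -> bool    # True if the path points to a readable file
#   get_env_var(name, default) -> value   # environment lookup with a default
#   logger                                # a configured logging.Logger instance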
class MultimodalTools:
    """Free multimodal AI tools using OpenRouter and other free services"""

    def __init__(self, openrouter_key: Optional[str] = None):
        self.openrouter_key = openrouter_key or get_env_var("OPENROUTER_API_KEY", None)
        self.openrouter_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {self.openrouter_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://your-app.com",  # Optional: for analytics
            "X-Title": "Multimodal Tools"  # Optional: for analytics
        }
        # Available free multimodal models
        self.vision_model = "moonshotai/kimi-vl-a3b-thinking:free"
        self.text_model = "meta-llama/llama-4-maverick:free"
    def _make_openrouter_request(self, payload: Dict[str, Any]) -> str:
        """Make request to OpenRouter API with error handling"""
        try:
            response = requests.post(
                self.openrouter_url,
                headers=self.headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()

            result = response.json()
            if 'choices' in result and len(result['choices']) > 0:
                return result['choices'][0]['message']['content']
            else:
                logger.error(f"Unexpected response format: {result}")
                return "Error: Invalid response format"

        except requests.exceptions.RequestException as e:
            logger.error(f"OpenRouter API request failed: {str(e)}")
            return f"Error making API request: {str(e)}"
        except Exception as e:
            logger.error(f"Unexpected error: {str(e)}")
            return f"Unexpected error: {str(e)}"
    def analyze_image(self, image_path: str, question: str = "Describe this image in detail") -> str:
        """
        Analyze image content using multimodal AI

        Args:
            image_path: Path to image file
            question: Question about the image

        Returns:
            AI analysis of the image
        """
        if not validate_file_exists(image_path):
            return f"Error: Image file not found at {image_path}"

        try:
            encoded_image = encode_image_to_base64(image_path)

            payload = {
                "model": self.vision_model,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": question},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}
                            }
                        ]
                    }
                ],
                "temperature": 0,
                "max_tokens": 1024
            }

            return self._make_openrouter_request(payload)

        except Exception as e:
            error_msg = f"Error analyzing image: {str(e)}"
            logger.error(error_msg)
            return error_msg
    def extract_text_from_image(self, image_path: str) -> str:
        """
        Extract text from image using OCR via multimodal AI

        Args:
            image_path: Path to image file

        Returns:
            Extracted text from image
        """
        ocr_prompt = """Extract all visible text from this image.
        Return only the text content without any additional commentary or formatting.
        If no text is visible, return 'No text found'."""

        return self.analyze_image(image_path, ocr_prompt)
    def analyze_audio_transcript(self, transcript: str, question: str = "Summarize this audio content") -> str:
        """
        Analyze audio content via transcript

        Args:
            transcript: Audio transcript text
            question: Question about the audio content

        Returns:
            AI analysis of the audio content
        """
        if not transcript.strip():
            return "Error: Empty transcript provided"

        try:
            payload = {
                "model": self.text_model,
                "messages": [
                    {
                        "role": "user",
                        "content": f"Audio transcript: {transcript}\n\nQuestion: {question}"
                    }
                ],
                "temperature": 0,
                "max_tokens": 1024
            }

            return self._make_openrouter_request(payload)

        except Exception as e:
            error_msg = f"Error analyzing audio transcript: {str(e)}"
            logger.error(error_msg)
            return error_msg
    def describe_image(self, image_path: str) -> str:
        """Get a detailed description of an image"""
        return self.analyze_image(
            image_path,
            "Provide a detailed, objective description of this image including objects, people, colors, setting, and any notable details."
        )

    def answer_visual_question(self, image_path: str, question: str) -> str:
        """Answer a specific question about an image"""
        return self.analyze_image(image_path, question)
# Convenience functions for direct use
def analyze_image(image_path: str, question: str = "Describe this image in detail") -> str:
    """Standalone function to analyze an image"""
    tools = MultimodalTools()
    return tools.analyze_image(image_path, question)


def extract_text(image_path: str) -> str:
    """Standalone function to extract text from an image"""
    tools = MultimodalTools()
    return tools.extract_text_from_image(image_path)


def analyze_transcript(transcript: str, question: str = "Summarize this content") -> str:
    """Standalone function to analyze audio transcript"""
    tools = MultimodalTools()
    return tools.analyze_audio_transcript(transcript, question)
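

# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of how the class and convenience functions above might be
# called. "sample.jpg" and the transcript text are hypothetical placeholders,
# and OPENROUTER_API_KEY is assumed to be set in the environment.
if __name__ == "__main__":
    tools = MultimodalTools()

    # Vision model: open-ended description of a local image
    print(tools.describe_image("sample.jpg"))

    # Vision model used for OCR-style text extraction
    print(extract_text("sample.jpg"))

    # Text model: summarize an audio transcript
    print(analyze_transcript("Speaker 1: Welcome to the show. Speaker 2: Thanks for having me."))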