File size: 6,528 Bytes
c66530c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# tools/multimodal_tools.py
import json
import mimetypes
from typing import Optional, Dict, Any

import requests

from .utils import encode_image_to_base64, validate_file_exists, get_env_var, logger

class MultimodalTools:
    """Free multimodal AI tools using OpenRouter and other free services."""

    def __init__(self, openrouter_key: Optional[str] = None):
        """
        Args:
            openrouter_key: OpenRouter API key. Falls back to the
                OPENROUTER_API_KEY environment variable when omitted.
        """
        self.openrouter_key = openrouter_key or get_env_var("OPENROUTER_API_KEY", None)
        if not self.openrouter_key:
            # Without a key the Authorization header becomes "Bearer None" and
            # every request fails with 401 — surface the problem early.
            logger.warning("No OpenRouter API key configured; API requests will fail")
        self.openrouter_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {self.openrouter_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://your-app.com",  # Optional: for analytics
            "X-Title": "Multimodal Tools"  # Optional: for analytics
        }

        # Available free multimodal models
        self.vision_model = "moonshotai/kimi-vl-a3b-thinking:free"
        self.text_model = "meta-llama/llama-4-maverick:free"

    def _make_openrouter_request(self, payload: Dict[str, Any]) -> str:
        """Make a request to the OpenRouter API with error handling.

        Args:
            payload: JSON-serializable chat-completions request body.

        Returns:
            The assistant message content, or an "Error ..." string on failure
            (this class reports errors as strings rather than raising).
        """
        try:
            response = requests.post(
                self.openrouter_url,
                headers=self.headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()

            result = response.json()
            if 'choices' in result and len(result['choices']) > 0:
                return result['choices'][0]['message']['content']
            else:
                logger.error(f"Unexpected response format: {result}")
                return "Error: Invalid response format"

        except requests.exceptions.RequestException as e:
            logger.error(f"OpenRouter API request failed: {str(e)}")
            return f"Error making API request: {str(e)}"
        except Exception as e:
            logger.error(f"Unexpected error: {str(e)}")
            return f"Unexpected error: {str(e)}"

    def analyze_image(self, image_path: str, question: str = "Describe this image in detail") -> str:
        """
        Analyze image content using multimodal AI.

        Args:
            image_path: Path to image file
            question: Question about the image

        Returns:
            AI analysis of the image, or an "Error ..." string on failure.
        """
        if not validate_file_exists(image_path):
            return f"Error: Image file not found at {image_path}"

        try:
            encoded_image = encode_image_to_base64(image_path)

            # Detect the actual MIME type so PNG/GIF/WebP images are not
            # mislabeled in the data URL; fall back to JPEG (the previously
            # hard-coded value) when the type cannot be determined.
            mime_type, _ = mimetypes.guess_type(image_path)
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/jpeg"

            payload = {
                "model": self.vision_model,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": question},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"}
                            }
                        ]
                    }
                ],
                "temperature": 0,
                "max_tokens": 1024
            }

            return self._make_openrouter_request(payload)

        except Exception as e:
            error_msg = f"Error analyzing image: {str(e)}"
            logger.error(error_msg)
            return error_msg

    def extract_text_from_image(self, image_path: str) -> str:
        """
        Extract text from an image using OCR via multimodal AI.

        Args:
            image_path: Path to image file

        Returns:
            Extracted text, or an "Error ..." string on failure.
        """
        # Single-line prompt: the original triple-quoted literal carried
        # docstring indentation and blank lines into the model prompt.
        ocr_prompt = (
            "Extract all visible text from this image. "
            "Return only the text content without any additional commentary or formatting. "
            "If no text is visible, return 'No text found'."
        )
        return self.analyze_image(image_path, ocr_prompt)

    def analyze_audio_transcript(self, transcript: str, question: str = "Summarize this audio content") -> str:
        """
        Analyze audio content via its transcript.

        Args:
            transcript: Audio transcript text
            question: Question about the audio content

        Returns:
            AI analysis of the audio content, or an "Error ..." string.
        """
        if not transcript.strip():
            return "Error: Empty transcript provided"

        try:
            payload = {
                "model": self.text_model,
                "messages": [
                    {
                        "role": "user",
                        "content": f"Audio transcript: {transcript}\n\nQuestion: {question}"
                    }
                ],
                "temperature": 0,
                "max_tokens": 1024
            }

            return self._make_openrouter_request(payload)

        except Exception as e:
            error_msg = f"Error analyzing audio transcript: {str(e)}"
            logger.error(error_msg)
            return error_msg

    def describe_image(self, image_path: str) -> str:
        """Get a detailed, objective description of an image."""
        return self.analyze_image(
            image_path,
            "Provide a detailed, objective description of this image including objects, people, colors, setting, and any notable details."
        )

    def answer_visual_question(self, image_path: str, question: str) -> str:
        """Answer a specific question about an image."""
        return self.analyze_image(image_path, question)

# Convenience functions for direct use
def analyze_image(image_path: str, question: str = "Describe this image in detail") -> str:
    """Analyze an image with a throwaway MultimodalTools instance."""
    return MultimodalTools().analyze_image(image_path, question)

def extract_text(image_path: str) -> str:
    """Run OCR on an image with a throwaway MultimodalTools instance."""
    return MultimodalTools().extract_text_from_image(image_path)

def analyze_transcript(transcript: str, question: str = "Summarize this content") -> str:
    """Analyze an audio transcript with a throwaway MultimodalTools instance."""
    return MultimodalTools().analyze_audio_transcript(transcript, question)