File size: 11,288 Bytes
92d2175
040a6c6
 
92d2175
 
 
 
 
040a6c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d2175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
"""
AUDIO PROCESSING TOOL - Groq Audio Only
Handles audio file transcription using Groq Whisper API
"""

import os
import tempfile
import requests
from typing import Dict, Any, Optional
from groq import Groq
from .state_manager import get_agent_state

class AudioTool:
    """Transcribe audio with the Groq Whisper API.

    The tool accepts a URL, a local file path, or a base64-encoded payload,
    materialises it as a file on disk, and sends that file to Groq.
    """

    def __init__(self):
        # Falls back to an empty string when GROQ_API_KEY is unset.
        self.client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))
        self.model = "whisper-large-v3"
        print("🎵 Audio Tool (Groq Whisper) initialized")

    def process_audio(self, audio_input: str, **kwargs) -> Dict[str, Any]:
        """Transcribe ``audio_input`` and cache the result in the agent state.

        Args:
            audio_input: An http(s) URL, a path to an existing local file, or
                a base64-encoded audio payload.
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            On success, ``{"success": True, "data": {...}, "summary": ...}``
            where ``data`` holds the transcript, source, model and tool name.
            On failure, the dict produced by ``_error_result``. Exceptions
            are caught and reported through that dict, never raised.
        """
        audio_path = None
        try:
            audio_path = self._prepare_audio_file(audio_input)
            if not audio_path:
                return self._error_result("Could not prepare audio file")

            # Transcribe using Groq Whisper
            transcript = self._transcribe_with_groq(audio_path)

            result = {
                "transcript": transcript,
                "source": audio_input,
                "model": self.model,
                "tool": "groq_whisper"
            }

            # Update agent state
            state = get_agent_state()
            state.cached_data["audio_analysis"] = result

            return {
                "success": True,
                "data": result,
                "summary": f"Audio transcribed: {transcript[:100]}..."
            }

        except Exception as e:
            error_msg = f"Audio processing failed: {str(e)}"
            print(f"❌ {error_msg}")
            return self._error_result(error_msg)

        finally:
            # Remove only files this call created (download / base64 decode).
            # _prepare_audio_file returns the input unchanged for a local
            # path, so a differing path means "ours". Comparing against the
            # temp directory instead would delete a caller's own file that
            # happens to live there.
            if audio_path and audio_path != audio_input:
                try:
                    os.unlink(audio_path)
                except OSError:
                    # Best-effort cleanup; a leftover temp file must not
                    # turn a successful transcription into a failure.
                    pass

    def _prepare_audio_file(self, audio_input: str) -> Optional[str]:
        """Resolve ``audio_input`` to a file path on disk.

        Tries, in order: URL download, existing local path, base64 decode.

        Returns:
            The path to the audio file, or ``None`` when the input matches
            none of the supported forms or preparation raised.
        """
        try:
            # If it's a URL, download it
            if audio_input.startswith(('http://', 'https://')):
                return self._download_audio(audio_input)

            # If it's a local file path
            if os.path.exists(audio_input):
                return audio_input

            # If it's base64, decode it
            if self._is_base64(audio_input):
                return self._decode_base64_audio(audio_input)

            return None

        except Exception as e:
            print(f"⚠️ Audio prep error: {str(e)}")
            return None

    @staticmethod
    def _suffix_from_url(url: str) -> str:
        """Return the file extension of the URL's path, or ``'.mp3'``.

        Only the path component is inspected, so dots in the host name or
        query string are ignored. Extensions that are not purely
        alphanumeric fall back to the default.
        """
        from urllib.parse import urlparse

        ext = os.path.splitext(urlparse(url).path)[1]
        if ext and ext[1:].isalnum():
            return ext
        return '.mp3'

    def _download_audio(self, url: str) -> str:
        """Download ``url`` into a new temp file and return its path.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status (via ``raise_for_status``).
        """
        suffix = self._suffix_from_url(url)

        # The timeout keeps a stalled server from hanging the caller
        # forever; the context manager releases the streamed connection.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()

            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
            try:
                with tmp_file:
                    for chunk in response.iter_content(chunk_size=8192):
                        tmp_file.write(chunk)
            except Exception:
                # Don't leave a partial download behind.
                os.unlink(tmp_file.name)
                raise
            return tmp_file.name

    def _is_base64(self, s: str) -> bool:
        """Return True when ``s`` is non-empty, canonical base64.

        The check is a strict round trip: decoding and re-encoding must
        reproduce the input exactly. Empty input is rejected because it
        would otherwise decode to an empty audio file.
        """
        import base64
        try:
            if isinstance(s, str):
                s_bytes = s.encode('ascii')
            elif isinstance(s, bytes):
                s_bytes = s
            else:
                return False
            if not s_bytes:
                return False
            decoded = base64.b64decode(s_bytes, validate=True)
            return base64.b64encode(decoded) == s_bytes
        except (ValueError, TypeError):
            # Covers non-ASCII text and malformed base64 (binascii.Error
            # and UnicodeEncodeError are both ValueError subclasses).
            return False

    def _decode_base64_audio(self, b64_string: str) -> str:
        """Decode base64 audio into a new ``.mp3`` temp file; return its path."""
        import base64

        audio_data = base64.b64decode(b64_string)
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp_file:
            tmp_file.write(audio_data)
            return tmp_file.name

    def _transcribe_with_groq(self, audio_path: str) -> str:
        """Send the file at ``audio_path`` to Groq and return the transcript."""
        with open(audio_path, "rb") as audio_file:
            transcript = self.client.audio.transcriptions.create(
                file=audio_file,
                model=self.model,
                language="en",  # hard-coded to English
                response_format="text"
            )

        # The call may hand back a plain string or an object exposing .text.
        return transcript if isinstance(transcript, str) else transcript.text

    def _error_result(self, error_msg: str) -> Dict[str, Any]:
        """Build the standard failure dict for ``error_msg``."""
        return {
            "success": False,
            "error": error_msg,
            "data": None,
            "summary": f"Audio processing failed: {error_msg}"
        }

def download_audio_file(
    task_id: str,
    api_url: str = "https://agents-course-unit4-scoring.hf.space",
) -> Optional[str]:
    """Download the file attached to ``task_id`` into a temp file.

    Args:
        task_id: Identifier appended to ``{api_url}/files/``.
        api_url: Base URL of the file API. Defaults to the scoring endpoint
            this module was written against.

    Returns:
        Path to the downloaded temp file (the caller is responsible for
        deleting it), or ``None`` when ``task_id`` is empty, the server does
        not answer with HTTP 200, or any error occurs.
    """
    from urllib.parse import quote

    if not task_id:
        return None

    # Substrings looked for in an audio content type, checked in order.
    # "mpeg" and "mp4" are included because the common MIME names are
    # audio/mpeg and audio/mp4 rather than audio/mp3 and audio/m4a.
    suffix_by_token = (
        ("mpeg", ".mp3"),
        ("mp3", ".mp3"),
        ("wav", ".wav"),
        ("ogg", ".ogg"),
        ("m4a", ".m4a"),
        ("mp4", ".m4a"),
        ("flac", ".flac"),
        ("webm", ".webm"),
    )

    try:
        # Quote the id so characters such as "/" or "?" cannot alter the URL.
        file_url = f"{api_url.rstrip('/')}/files/{quote(str(task_id), safe='')}"

        response = requests.get(file_url, timeout=30)
        if response.status_code != 200:
            return None

        # Determine file extension; '.mp3' is the default for unknown or
        # non-audio content types.
        content_type = response.headers.get('content-type', '').lower()
        suffix = '.mp3'
        if 'audio' in content_type:
            suffix = next(
                (ext for token, ext in suffix_by_token if token in content_type),
                '.mp3',
            )

        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(response.content)
            return tmp_file.name
    except Exception as e:
        print(f"Error downloading audio: {e}")
        return None

def transcribe_audio_groq(task_id: str = "", audio_path: str = "", language: str = "en") -> str:
    """Transcribe audio with the Groq Whisper API (model whisper-large-v3).

    An existing ``audio_path`` takes precedence; otherwise the file is
    downloaded via ``download_audio_file(task_id)`` and removed afterwards.

    Args:
        task_id: ID used to download the file from the API.
        audio_path: Path to a local audio file (if available).
        language: Transcription language (default: "en").

    Returns:
        The stripped transcript on success. Failures are reported as text
        rather than raised: precondition failures start with ``"Error:"``,
        and exceptions are returned as ``"Audio transcription error: ..."``.
    """
    target_audio_path = None
    downloaded = False  # True only when this call created the file

    try:
        # Initialize Groq client
        groq_api_key = os.environ.get("GROQ_API_KEY")

        if not groq_api_key:
            return "Error: GROQ_API_KEY not found in environment variables"

        groq_client = Groq(api_key=groq_api_key)

        # Resolve which audio file to use
        if audio_path and os.path.exists(audio_path):
            target_audio_path = audio_path
        elif task_id:
            target_audio_path = download_audio_file(task_id)
            if not target_audio_path:
                return "Error: Could not download audio file"
            downloaded = True
        else:
            return "Error: No audio path or task_id provided"

        # Make sure the audio file exists
        if not os.path.exists(target_audio_path):
            return "Error: Audio file not found"

        # Transcribe with Groq Whisper
        with open(target_audio_path, "rb") as audio_file:
            transcription = groq_client.audio.transcriptions.create(
                file=(os.path.basename(target_audio_path), audio_file.read()),
                model="whisper-large-v3",
                response_format="text",
                language=language,
                temperature=0.0  # Deterministic results
            )

        # The response may be an object exposing .text or a plain value.
        if hasattr(transcription, 'text'):
            result = transcription.text
        else:
            result = str(transcription)

        return result.strip()

    except Exception as e:
        return f"Audio transcription error: {str(e)}"

    finally:
        # Single cleanup point for both success and failure: remove the
        # downloaded file, never a caller-supplied one.
        if downloaded and target_audio_path:
            try:
                os.unlink(target_audio_path)
            except OSError:
                # Best-effort: a leftover temp file should not change the
                # result already being returned.
                pass

def transcribe_audio_with_details(task_id: str = "", audio_path: str = "", language: str = "en") -> dict:
    """Transcribe audio and return the text together with basic metadata.

    Args:
        task_id: ID used to download the file from the API.
        audio_path: Path to a local audio file (if available).
        language: Transcription language (default: "en").

    Returns:
        A dict with keys ``"transcription"`` (text or error message),
        ``"metadata"`` (model, language, provider, plus file size and path
        when ``audio_path`` exists) and ``"success"`` (bool).
    """
    # transcribe_audio_groq reports failures as text, not exceptions:
    # precondition failures start with "Error:" and caught exceptions with
    # "Audio transcription error:". Both must count as failure.
    # NOTE(review): a genuine transcript that begins with one of these
    # prefixes would be misreported as a failure.
    failure_prefixes = ("Error:", "Audio transcription error:")

    try:
        # Get the transcription
        text = transcribe_audio_groq(task_id, audio_path, language)

        # Basic metadata
        metadata = {
            "model": "whisper-large-v3",
            "language": language,
            "provider": "groq"
        }

        # For a local file, add file details
        if audio_path and os.path.exists(audio_path):
            metadata["file_size"] = os.path.getsize(audio_path)
            metadata["file_path"] = audio_path

        return {
            "transcription": text,
            "metadata": metadata,
            "success": not text.startswith(failure_prefixes)
        }

    except Exception as e:
        return {
            "transcription": f"Error: {str(e)}",
            "metadata": {},
            "success": False
        }

# Fallback used when Groq transcription is unavailable
def fallback_audio_info(task_id: str = "", audio_path: str = "") -> str:
    """Describe the audio file when it cannot be transcribed.

    An existing ``audio_path`` takes precedence; otherwise the file is
    downloaded via ``download_audio_file(task_id)`` and removed afterwards.

    Args:
        task_id: ID used to download the file from the API.
        audio_path: Path to a local audio file (if available).

    Returns:
        A message containing the file size in bytes, or an error message
        (``"Error: ..."`` / ``"Audio processing error: ..."``). Nothing is
        raised.
    """
    target_audio_path = None
    downloaded = False  # True only when this call created the file

    try:
        if audio_path and os.path.exists(audio_path):
            target_audio_path = audio_path
        elif task_id:
            target_audio_path = download_audio_file(task_id)
            if not target_audio_path:
                return "Error: Could not download audio file"
            downloaded = True
        else:
            return "Error: No audio path or task_id provided"

        # Basic file info
        file_size = os.path.getsize(target_audio_path)
        return f"Audio file detected - Size: {file_size} bytes. Groq transcription not available. Please describe the audio content."

    except Exception as e:
        return f"Audio processing error: {str(e)}"

    finally:
        # Runs on every exit path so a failed size lookup cannot leak the
        # downloaded file; caller-supplied files are never removed.
        if downloaded and target_audio_path:
            try:
                os.unlink(target_audio_path)
            except OSError:
                # Best-effort cleanup only.
                pass

# Test function
if __name__ == "__main__":
    # Manual smoke test: transcribe a local audio file when one is present.
    test_audio = "/path/to/test/audio.mp3"
    if not os.path.exists(test_audio):
        print("No test audio found")

        # To test with a task_id instead (requires an API key):
        # result = transcribe_audio_groq(task_id="some_task_id")
        # print("Transcription Result:", result)
    else:
        result = transcribe_audio_groq(audio_path=test_audio)
        print("Transcription Result:", result)