Spaces:

tuanhqv123
/

final_agent_course

Running

File size: 11,288 Bytes

"""
AUDIO PROCESSING TOOL - Groq Audio Only
Handles audio file transcription using Groq Whisper API
"""

import os
import tempfile
import requests
from typing import Dict, Any, Optional
from groq import Groq
from .state_manager import get_agent_state

class AudioTool:
    """Transcribe audio with the Groq Whisper API.

    Accepts a URL, a local file path, or a base64 string, turns it into a
    file on disk, and sends that file to Groq for transcription. The last
    successful result is also stored in the shared agent state under
    ``cached_data["audio_analysis"]``.
    """

    def __init__(self):
        # An empty key is passed through when GROQ_API_KEY is unset.
        self.client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))
        self.model = "whisper-large-v3"
        print("🎵 Audio Tool (Groq Whisper) initialized")

    def process_audio(self, audio_input: str, **kwargs) -> Dict[str, Any]:
        """
        Process audio files using Groq Whisper API
        Supports URLs, file paths, and base64 audio

        Args:
            audio_input: URL, local path, or base64-encoded audio.
            **kwargs: Optional ``language`` (language code passed to the
                API, default ``"en"``). Other keys are ignored.

        Returns:
            ``{"success": True, "data": {...}, "summary": ...}`` on success,
            otherwise the dict produced by ``_error_result``. Never raises.
        """
        audio_path = None
        try:
            audio_path = self._prepare_audio_file(audio_input)
            if not audio_path:
                return self._error_result("Could not prepare audio file")

            # Transcribe using Groq Whisper
            language = kwargs.get("language") or "en"
            transcript = self._transcribe_with_groq(audio_path, language=language)

            result = {
                "transcript": transcript,
                "source": audio_input,
                "model": self.model,
                "tool": "groq_whisper"
            }

            # Update agent state
            state = get_agent_state()
            state.cached_data["audio_analysis"] = result

            return {
                "success": True,
                "data": result,
                "summary": f"Audio transcribed: {transcript[:100]}..."
            }

        except Exception as e:
            error_msg = f"Audio processing failed: {str(e)}"
            print(f"❌ {error_msg}")
            return self._error_result(error_msg)

        finally:
            # Runs on success AND failure so a failed transcription no
            # longer leaves the downloaded/decoded file behind.
            self._cleanup_temp_file(audio_path)

    @staticmethod
    def _cleanup_temp_file(audio_path: Optional[str]) -> None:
        """Best-effort removal of ``audio_path`` if it lives under the temp dir.

        NOTE(review): the prefix test also matches caller-supplied files that
        happen to sit under the system temp directory; that matches the
        previous behavior and is kept for compatibility — confirm callers
        rely on it before tightening.
        """
        if not audio_path or not audio_path.startswith(tempfile.gettempdir()):
            return
        try:
            os.unlink(audio_path)
        except OSError as e:
            # A failed cleanup must not turn a good transcription into an error.
            print(f"⚠️ Could not remove temp file {audio_path}: {e}")

    def _prepare_audio_file(self, audio_input: str) -> Optional[str]:
        """Resolve ``audio_input`` to a file path, or ``None`` if unusable.

        Tried in order: HTTP(S) URL (downloaded), existing local path
        (returned as-is), base64 string (decoded to a temp file).
        Any exception is logged and reported as ``None``.
        """
        try:
            # If it's a URL, download it
            if audio_input.startswith(('http://', 'https://')):
                return self._download_audio(audio_input)

            # If it's a local file path
            if os.path.exists(audio_input):
                return audio_input

            # If it's base64, decode it
            if self._is_base64(audio_input):
                return self._decode_base64_audio(audio_input)

            return None

        except Exception as e:
            print(f"⚠️ Audio prep error: {str(e)}")
            return None

    @staticmethod
    def _suffix_from_url(url: str, default: str = '.mp3') -> str:
        """Return the file extension of the URL's path component.

        Only the path is inspected, so dots in the host name or query string
        cannot leak into the suffix. Falls back to ``default`` when the path
        has no purely alphanumeric extension.
        """
        from urllib.parse import urlparse

        ext = os.path.splitext(urlparse(url).path)[1]
        if len(ext) > 1 and ext[1:].isalnum():
            return ext
        return default

    def _download_audio(self, url: str) -> str:
        """Download audio from URL to temp file and return its path.

        Raises whatever ``requests`` raises (connection errors, timeouts,
        HTTP error statuses); a partially written file is removed first.
        """
        suffix = self._suffix_from_url(url)
        tmp_path = None
        try:
            # Timeout keeps a stalled server from hanging the caller forever.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                    tmp_path = tmp_file.name
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            tmp_file.write(chunk)
            return tmp_path
        except Exception:
            self._cleanup_temp_file(tmp_path)
            raise

    def _is_base64(self, s: str) -> bool:
        """Check if string is non-empty, canonical base64.

        The value must survive a decode/encode round trip unchanged, so
        strings with whitespace, missing padding, or non-ASCII characters
        are rejected. Short plain words of the right shape still pass.
        """
        import base64
        try:
            if isinstance(s, str):
                s_bytes = bytes(s, 'ascii')
            elif isinstance(s, bytes):
                s_bytes = s
            else:
                return False
            if not s_bytes:
                # Empty input round-trips trivially but carries no audio.
                return False
            return base64.b64encode(base64.b64decode(s_bytes)) == s_bytes
        except Exception:
            return False

    def _decode_base64_audio(self, b64_string: str) -> str:
        """Decode base64 audio to a ``.mp3`` temp file and return its path."""
        import base64

        audio_data = base64.b64decode(b64_string)
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp_file:
            tmp_file.write(audio_data)
            return tmp_file.name

    def _transcribe_with_groq(self, audio_path: str, language: str = "en") -> str:
        """Transcribe the file at ``audio_path`` using Groq Whisper API.

        Args:
            audio_path: Path of the audio file to upload.
            language: Language code sent to the API (default ``"en"``).

        Returns:
            The transcript text.
        """
        with open(audio_path, "rb") as audio_file:
            transcript = self.client.audio.transcriptions.create(
                file=audio_file,
                model=self.model,
                language=language,
                response_format="text"
            )

        # The response is either a plain string or an object exposing `.text`.
        return transcript if isinstance(transcript, str) else transcript.text

    def _error_result(self, error_msg: str) -> Dict[str, Any]:
        """Standard error result format"""
        return {
            "success": False,
            "error": error_msg,
            "data": None,
            "summary": f"Audio processing failed: {error_msg}"
        }

def download_audio_file(task_id: str) -> Optional[str]:
    """
    Download the audio file attached to ``task_id`` from the scoring API.

    The file is written to a temp file whose suffix is chosen from the
    response's Content-Type header ('.mp3' when the type is unknown or not
    audio). The caller is responsible for deleting the returned file.

    Args:
        task_id: Identifier appended to the API's ``/files/`` endpoint.

    Returns:
        Path of the downloaded temp file, or None when the server does not
        answer 200 or any error occurs (the error is printed).
    """
    # Exact media types checked first; plain substring matching misses
    # common ones such as audio/mp4 (m4a), audio/flac and audio/webm.
    known_suffixes = {
        'audio/mpeg': '.mp3',
        'audio/mp3': '.mp3',
        'audio/wav': '.wav',
        'audio/wave': '.wav',
        'audio/x-wav': '.wav',
        'audio/vnd.wave': '.wav',
        'audio/ogg': '.ogg',
        'audio/mp4': '.m4a',
        'audio/m4a': '.m4a',
        'audio/x-m4a': '.m4a',
        'audio/flac': '.flac',
        'audio/x-flac': '.flac',
        'audio/webm': '.webm',
    }

    try:
        api_url = "https://agents-course-unit4-scoring.hf.space"
        file_url = f"{api_url}/files/{task_id}"

        response = requests.get(file_url, timeout=30)
        if response.status_code != 200:
            return None

        # Header values are case-insensitive and may carry parameters
        # (e.g. "audio/mpeg; charset=binary").
        content_type = response.headers.get('content-type', '').lower()
        media_type = content_type.split(';', 1)[0].strip()

        suffix = known_suffixes.get(media_type)
        if suffix is None:
            suffix = '.mp3'  # Default for unknown or non-audio types
            if 'audio' in content_type:
                # Legacy substring rules, kept for unlisted audio subtypes.
                for marker in ('mp3', 'wav', 'ogg', 'm4a'):
                    if marker in content_type:
                        suffix = '.' + marker
                        break

        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(response.content)
            return tmp_file.name
    except Exception as e:
        print(f"Error downloading audio: {e}")
        return None

def transcribe_audio_groq(task_id: str = "", audio_path: str = "", language: str = "en") -> str:
    """
    Main function: transcribe audio with the Groq Whisper API (whisper-large-v3).

    An existing local ``audio_path`` takes precedence; otherwise the file for
    ``task_id`` is downloaded and removed again once transcription finishes
    or fails.

    Args:
        task_id: ID used to download the file from the API
        audio_path: Path to a local audio file (if available)
        language: Transcription language (default: "en")

    Returns:
        The transcribed text with surrounding whitespace stripped. Failures
        are reported as strings rather than raised: they start with
        "Error:" for input/setup problems and with
        "Audio transcription error:" for exceptions.
    """
    target_audio_path = None
    downloaded = False  # True only for a temp file this call created

    try:
        # Initialize Groq client
        groq_api_key = os.environ.get("GROQ_API_KEY")

        if not groq_api_key:
            return "Error: GROQ_API_KEY not found in environment variables"

        groq_client = Groq(api_key=groq_api_key)

        # Resolve which audio file to use
        if audio_path and os.path.exists(audio_path):
            target_audio_path = audio_path
        elif task_id:
            target_audio_path = download_audio_file(task_id)
            if not target_audio_path:
                return "Error: Could not download audio file"
            downloaded = True
        else:
            return "Error: No audio path or task_id provided"

        # Make sure the audio file is still there
        if not os.path.exists(target_audio_path):
            return "Error: Audio file not found"

        # Transcribe with Groq Whisper
        with open(target_audio_path, "rb") as audio_file:
            transcription = groq_client.audio.transcriptions.create(
                file=(os.path.basename(target_audio_path), audio_file.read()),
                model="whisper-large-v3",
                response_format="text",
                language=language,
                temperature=0.0  # Deterministic results
            )

        # The response is either an object exposing `.text` or plain text
        if hasattr(transcription, 'text'):
            result = transcription.text
        else:
            result = str(transcription)

        return result.strip()

    except Exception as e:
        return f"Audio transcription error: {str(e)}"

    finally:
        # Single cleanup point for both the success and the failure path;
        # a caller-supplied file is never removed.
        if downloaded:
            try:
                os.unlink(target_audio_path)
            except OSError:
                pass  # Best effort: the file may already be gone

def transcribe_audio_with_details(task_id: str = "", audio_path: str = "", language: str = "en") -> dict:
    """
    Transcribe audio and return the text together with metadata.

    Args:
        task_id: ID used to download the file from the API
        audio_path: Path to a local audio file (if available)
        language: Transcription language (default: "en")

    Returns:
        Dict with keys "transcription" (text or error message), "metadata"
        (model, language, provider, plus file_size/file_path when
        ``audio_path`` exists) and "success" (False when the transcription
        text is one of the known failure messages).
    """
    # transcribe_audio_groq reports failures as strings with these prefixes;
    # checking only "Error:" would mark caught exceptions as successes.
    failure_prefixes = ("Error:", "Audio transcription error:")

    try:
        # Get the transcription
        text = transcribe_audio_groq(task_id, audio_path, language)

        # Basic metadata
        metadata = {
            "model": "whisper-large-v3",
            "language": language,
            "provider": "groq"
        }

        # Add file details when a local file is available
        if audio_path and os.path.exists(audio_path):
            metadata["file_size"] = os.path.getsize(audio_path)
            metadata["file_path"] = audio_path

        return {
            "transcription": text,
            "metadata": metadata,
            "success": not text.startswith(failure_prefixes)
        }

    except Exception as e:
        return {
            "transcription": f"Error: {str(e)}",
            "metadata": {},
            "success": False
        }

# Fallback function used when Groq is not available
def fallback_audio_info(task_id: str = "", audio_path: str = "") -> str:
    """
    Fallback used when the audio cannot be transcribed.

    Reports the size of the audio file instead of a transcript. An existing
    local ``audio_path`` takes precedence; otherwise the file for ``task_id``
    is downloaded and removed again before returning.

    Args:
        task_id: ID used to download the file from the API
        audio_path: Path to a local audio file (if available)

    Returns:
        A message containing the file size, or a string starting with
        "Error:" / "Audio processing error:" on failure. Never raises.
    """
    target_audio_path = None
    downloaded = False  # True only for a temp file this call created

    try:
        if audio_path and os.path.exists(audio_path):
            target_audio_path = audio_path
        elif task_id:
            target_audio_path = download_audio_file(task_id)
            if not target_audio_path:
                return "Error: Could not download audio file"
            downloaded = True
        else:
            return "Error: No audio path or task_id provided"

        # Basic file info
        file_size = os.path.getsize(target_audio_path)
        return f"Audio file detected - Size: {file_size} bytes. Groq transcription not available. Please describe the audio content."

    except Exception as e:
        return f"Audio processing error: {str(e)}"

    finally:
        # Cleanup also runs when reading the file size fails; a
        # caller-supplied file is never removed.
        if downloaded:
            try:
                os.unlink(target_audio_path)
            except OSError:
                pass  # Best effort: the file may already be gone

# Manual smoke test
if __name__ == "__main__":
    # Transcribe a local sample file when one is present.
    sample_path = "/path/to/test/audio.mp3"
    if not os.path.exists(sample_path):
        print("No test audio found")

        # To exercise the download path instead (requires an API key):
        # result = transcribe_audio_groq(task_id="some_task_id")
        # print("Transcription Result:", result)
    else:
        transcription_text = transcribe_audio_groq(audio_path=sample_path)
        print("Transcription Result:", transcription_text)