"""
Speech to Text Service
Wrapper for Whisper STT
"""

import whisper
import torch
import warnings
import os
from typing import Dict, Optional
from app.core.device import get_device, optimize_for_device
warnings.filterwarnings('ignore')


class SpeechToTextService:
    """Speech-to-Text service using Whisper"""
    
    def __init__(self, model_name: str = "medium", device: Optional[str] = None, language: str = "id"):
        """Initialize Whisper model"""
        print("🎙️ Initializing Speech-to-Text service")
        print(f"📦 Loading Whisper model: {model_name}")
        
        # Auto-detect device if not specified
        if device is None or device == "auto":
            self.device = get_device()
            optimize_for_device(self.device)
        else:
            self.device = device
            print(f"💻 Using device: {self.device}")
        
        # Check if model is already cached
        # Use /data/.cache for Whisper (persistent storage on HF Pro)
        cache_dir = os.environ.get('WHISPER_CACHE', '/data/.cache')
        model_cache_path = os.path.join(cache_dir, f'{model_name}.pt')
        
        # Load Whisper model
        try:
            if os.path.exists(model_cache_path):
                print("✅ Loading from cache (pre-downloaded during build)")
            else:
                print(f"📥 Model not in cache, downloading '{model_name}'...")
                print("   This may take 1-2 minutes...")

            self.model = whisper.load_model(model_name, device=self.device, download_root=cache_dir)
            print("✅ Whisper model ready!\n")
        except Exception as e:
            print(f"❌ Failed to load model '{model_name}': {e}")
            print("⚙️ Falling back to 'base' model...")

            base_cache_path = os.path.join(cache_dir, 'base.pt')
            if os.path.exists(base_cache_path):
                print("✅ Loading base model from cache")
            else:
                print("📥 Downloading base model...")

            self.model = whisper.load_model("base", device=self.device, download_root=cache_dir)
            print("✅ Base model ready!\n")
        
        self.language = language
    
    def transcribe(self, audio_path: str, **kwargs) -> Dict:
        """
        Transcribe audio file to text
        
        Args:
            audio_path: Path to the audio file
            **kwargs: Additional Whisper parameters
            
        Returns:
            Dict: {'text': str, 'segments': list, 'language': str}
        """
        print(f"🎧 Transcribing: {audio_path}")
        
        try:
            # Try with word_timestamps first
            # Use FP16 for GPU to reduce memory and improve speed
            fp16 = self.device == "cuda"
            
            result = self.model.transcribe(
                audio_path,
                language=self.language,
                task="transcribe",
                word_timestamps=True,
                condition_on_previous_text=False,
                fp16=fp16,
                **kwargs
            )
        except Exception as e:
            print(f"⚠️ Transcription with word_timestamps failed: {e}")
            print("🔄 Retrying without word_timestamps...")
            
            # Fallback: transcribe without word_timestamps
            fp16 = self.device == "cuda"
            
            result = self.model.transcribe(
                audio_path,
                language=self.language,
                task="transcribe",
                condition_on_previous_text=False,
                fp16=fp16,
                **kwargs
            )
        
        print("✅ Transcription complete!\n")
        
        return {
            'text': result['text'],
            'segments': result.get('segments', []),
            'language': result.get('language', self.language)
        }
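

if __name__ == "__main__":
    # Minimal usage sketch, not part of the service itself: it assumes a local
    # "sample.wav" file exists and that app.core.device is importable from the
    # current working directory. Adjust the model name and language as needed.
    stt = SpeechToTextService(model_name="base", device="auto", language="id")
    output = stt.transcribe("sample.wav")

    # Full transcript followed by per-segment timestamps returned by Whisper.
    print(output['text'])
    for segment in output['segments']:
        print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['text']}")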