| """ | |
| Speech to Text Service | |
| Wrapper untuk Whisper STT | |
| """ | |
import whisper
import torch
import warnings
import os
from typing import Dict

from app.core.device import get_device, optimize_for_device

warnings.filterwarnings('ignore')


class SpeechToTextService:
    """Speech-to-Text service using Whisper"""

    def __init__(self, model_name: str = "medium", device: str = None, language: str = "id"):
        """Initialize the Whisper model"""
        print("🎙️ Initializing Speech-to-Text service")
        print(f"📦 Loading Whisper model: {model_name}")

        # Auto-detect device if not specified
        if device is None or device == "auto":
            self.device = get_device()
            optimize_for_device(self.device)
        else:
            self.device = device
        print(f"💻 Using device: {self.device}")

        # Check if the model is already cached
        # Use /data/.cache for Whisper (persistent storage on HF Pro)
        cache_dir = os.environ.get('WHISPER_CACHE', '/data/.cache')
        model_cache_path = os.path.join(cache_dir, f'{model_name}.pt')

        # Load the Whisper model
        try:
            if os.path.exists(model_cache_path):
                print("✅ Loading from cache (pre-downloaded during build)")
            else:
                print(f"📥 Model not in cache, downloading '{model_name}'...")
                print("   This may take 1-2 minutes...")
            self.model = whisper.load_model(model_name, device=self.device, download_root=cache_dir)
            print("✅ Whisper model ready!\n")
        except Exception as e:
            print(f"❌ Failed to load model '{model_name}': {e}")
            print("⚠️ Falling back to 'base' model...")
            base_cache_path = os.path.join(cache_dir, 'base.pt')
            if os.path.exists(base_cache_path):
                print("✅ Loading base model from cache")
            else:
                print("📥 Downloading base model...")
            self.model = whisper.load_model("base", device=self.device, download_root=cache_dir)
            print("✅ Base model ready!\n")

        self.language = language

    def transcribe(self, audio_path: str, **kwargs) -> Dict:
        """
        Transcribe an audio file to text

        Args:
            audio_path: Path to the audio file
            **kwargs: Additional Whisper parameters

        Returns:
            Dict: {'text': str, 'segments': list, 'language': str}
        """
| print(f"π§ Transcribing: {audio_path}") | |
| try: | |
| # Try with word_timestamps first | |
| # Use FP16 for GPU to reduce memory and improve speed | |
| fp16 = self.device == "cuda" | |
| result = self.model.transcribe( | |
| audio_path, | |
| language=self.language, | |
| task="transcribe", | |
| word_timestamps=True, | |
| condition_on_previous_text=False, | |
| fp16=fp16, | |
| **kwargs | |
| ) | |
| except Exception as e: | |
| print(f"β οΈ Transcription with word_timestamps failed: {e}") | |
| print(f"π Retrying without word_timestamps...") | |
| # Fallback: transcribe without word_timestamps | |
| fp16 = self.device == "cuda" | |
| result = self.model.transcribe( | |
| audio_path, | |
| language=self.language, | |
| task="transcribe", | |
| condition_on_previous_text=False, | |
| fp16=fp16, | |
| **kwargs | |
| ) | |
| print("β Transcription complete!\n") | |
| return { | |
| 'text': result['text'], | |
| 'segments': result.get('segments', []), | |
| 'language': result.get('language', self.language) | |
| } | |
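

# Minimal usage sketch (illustrative only, not part of the service itself).
# Assumptions: an audio file exists at the placeholder path "sample.wav", the
# app.core.device helpers are importable, and the "tiny" model is chosen here
# purely to keep a local smoke test fast; swap in "medium" for real use.
if __name__ == "__main__":
    stt = SpeechToTextService(model_name="tiny", device="auto", language="id")
    result = stt.transcribe("sample.wav")

    # Full transcript, then per-segment timestamps as returned by Whisper
    print(result['text'])
    for segment in result['segments']:
        print(f"[{segment['start']:.2f}s -> {segment['end']:.2f}s] {segment['text']}")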