Michael Hu committed
Commit: 619b266
1 Parent(s): c8d736e
refactor: remove parakeet ASR provider and update all references to Whisper only
- src/application/dtos/processing_request_dto.py +1 -1
- src/application/services/audio_processing_service.py +1 -1
- src/application/services/configuration_service.py +2 -2
- src/domain/interfaces/speech_recognition.py +1 -2
- src/infrastructure/config/app_config.py +2 -2
- src/infrastructure/stt/__init__.py +0 -2
- src/infrastructure/stt/legacy_compatibility.py +2 -2
- src/infrastructure/stt/parakeet_provider.py +0 -168
- src/infrastructure/stt/provider_factory.py +3 -6
src/application/dtos/processing_request_dto.py
CHANGED
@@ -35,7 +35,7 @@ class ProcessingRequestDto:
             raise ValueError("ASR model cannot be empty")

         # Validate ASR model options
-        supported_asr_models = ['
+        supported_asr_models = ['whisper-small', 'whisper-medium', 'whisper-large']
         if self.asr_model not in supported_asr_models:
             raise ValueError(f"Unsupported ASR model: {self.asr_model}. Supported: {supported_asr_models}")

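For context, a minimal sketch of the validation behaviour after this change. Only asr_model and the supported list come from the diff; the surrounding dataclass layout and the __post_init__ placement are assumptions.

    from dataclasses import dataclass

    @dataclass
    class ProcessingRequestDto:
        asr_model: str  # assumed field layout; other request fields omitted

        def __post_init__(self):
            if not self.asr_model:
                raise ValueError("ASR model cannot be empty")
            # Validate ASR model options; 'parakeet' is no longer accepted
            supported_asr_models = ['whisper-small', 'whisper-medium', 'whisper-large']
            if self.asr_model not in supported_asr_models:
                raise ValueError(f"Unsupported ASR model: {self.asr_model}. Supported: {supported_asr_models}")

    ProcessingRequestDto(asr_model='whisper-large')  # accepted
    ProcessingRequestDto(asr_model='parakeet')       # raises ValueError after this commit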
src/application/services/audio_processing_service.py
CHANGED
@@ -634,7 +634,7 @@ class AudioProcessingApplicationService:
             Dict[str, Any]: Supported configurations
         """
         return {
-            'asr_models': ['
+            'asr_models': ['whisper-large'],
             'voices': ['chatterbox'],
             'languages': ['en', 'zh'],
             'audio_formats': self._config.get_processing_config()['supported_audio_formats'],
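The capability map returned by the service now advertises whisper-large only. A hedged usage sketch; the accessor name get_supported_configurations and the guard function are assumptions, while the dictionary keys come from the diff.

    def is_request_supported(service, asr_model: str, language: str) -> bool:
        caps = service.get_supported_configurations()  # assumed accessor for the dict shown above
        # 'parakeet' entries are gone, so only 'whisper-large' passes the asr_models check
        return asr_model in caps['asr_models'] and language in caps['languages']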
src/application/services/configuration_service.py
CHANGED
@@ -331,7 +331,7 @@ class ConfigurationApplicationService:
         Raises:
             ConfigurationException: If validation fails
         """
-        valid_providers = ['whisper'
+        valid_providers = ['whisper']

         for key, value in updates.items():
             if key == 'preferred_providers':

@@ -524,7 +524,7 @@ class ConfigurationApplicationService:

         # Check STT providers
         stt_factory = self._container.resolve(type(self._container._get_stt_factory()))
-        for provider in ['whisper'
+        for provider in ['whisper']:
             try:
                 stt_factory.create_provider(provider)
                 availability['stt'][provider] = True
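The availability check now only probes Whisper. A simplified sketch of the loop's effect, with the container wiring replaced by a plain factory argument and the failure branch assumed for illustration:

    def check_stt_availability(stt_factory) -> dict:
        availability = {'stt': {}}
        for provider in ['whisper']:  # 'parakeet' removed from the probe list
            try:
                stt_factory.create_provider(provider)
                availability['stt'][provider] = True
            except Exception:
                availability['stt'][provider] = False  # assumed handling; not shown in the diff
        return availability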
src/domain/interfaces/speech_recognition.py
CHANGED
@@ -5,7 +5,7 @@ audio content into textual representation. The interface supports multiple STT
 models and providers with consistent error handling.

 The interface is designed to be:
-- Model-agnostic: Works with any STT implementation (Whisper,
+- Model-agnostic: Works with any STT implementation (Whisper, etc.)
 - Language-aware: Handles multiple languages and dialects
 - Error-resilient: Provides detailed error information for debugging
 - Performance-conscious: Supports both batch and streaming transcription

@@ -65,7 +65,6 @@ class ISpeechRecognitionService(ABC):
             model: The STT model identifier to use for transcription. Examples:
                 - "whisper-small": Fast, lower accuracy
                 - "whisper-large": Slower, higher accuracy
-                - "parakeet": Real-time optimized
                 Must be supported by the implementation.

         Returns:
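For reference, a minimal stub that still satisfies the now Whisper-only wording of the interface. The abstract method name transcribe is an assumption; only the model identifiers are taken from the docstring in the diff.

    from abc import ABC, abstractmethod
    from pathlib import Path

    class ISpeechRecognitionService(ABC):
        @abstractmethod
        def transcribe(self, audio_path: Path, model: str) -> str:
            """Transcribe audio with an STT model such as 'whisper-small' or 'whisper-large'."""

    class DummyWhisperService(ISpeechRecognitionService):
        def transcribe(self, audio_path: Path, model: str) -> str:
            # Placeholder only; a real implementation would invoke Whisper here
            return f"[{model}] transcript of {audio_path.name}"

    print(DummyWhisperService().transcribe(Path("clip.wav"), "whisper-small"))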
src/infrastructure/config/app_config.py
CHANGED
@@ -23,8 +23,8 @@ class TTSConfig:
 @dataclass
 class STTConfig:
     """Configuration for STT providers."""
-    preferred_providers: List[str] = field(default_factory=lambda: ['
-    default_model: str = '
+    preferred_providers: List[str] = field(default_factory=lambda: ['whisper'])
+    default_model: str = 'whisper'
     chunk_length_s: int = 30
     batch_size: int = 16
     enable_vad: bool = True
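A quick check of the new defaults; the field names are copied from the diff, and the standalone assertion is only illustrative.

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class STTConfig:
        """Configuration for STT providers."""
        preferred_providers: List[str] = field(default_factory=lambda: ['whisper'])
        default_model: str = 'whisper'
        chunk_length_s: int = 30
        batch_size: int = 16
        enable_vad: bool = True

    cfg = STTConfig()
    assert cfg.preferred_providers == ['whisper'] and cfg.default_model == 'whisper'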
src/infrastructure/stt/__init__.py
CHANGED
@@ -1,13 +1,11 @@
 """STT provider implementations."""

 from .whisper_provider import WhisperSTTProvider
-from .parakeet_provider import ParakeetSTTProvider
 from .provider_factory import STTProviderFactory, ASRFactory
 from .legacy_compatibility import transcribe_audio, create_audio_content_from_file

 __all__ = [
     'WhisperSTTProvider',
-    'ParakeetSTTProvider',
     'STTProviderFactory',
     'ASRFactory',
     'transcribe_audio',
src/infrastructure/stt/legacy_compatibility.py
CHANGED
@@ -11,7 +11,7 @@ from ...domain.exceptions import SpeechRecognitionException
 logger = logging.getLogger(__name__)


-def transcribe_audio(audio_path: Union[str, Path], model_name: str = "parakeet") -> str:
+def transcribe_audio(audio_path: Union[str, Path], model_name: str = "whisper") -> str:
     """
     Convert audio file to text using specified STT model (legacy interface).

@@ -19,7 +19,7 @@ def transcribe_audio(audio_path: Union[str, Path], model_name: str = "parakeet")

     Args:
         audio_path: Path to input audio file
-        model_name: Name of the STT model/provider to use (whisper
+        model_name: Name of the STT model/provider to use (whisper)

     Returns:
         str: Transcribed English text
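Existing callers of the legacy helper keep working if they relied on the default or passed "whisper" explicitly; requesting "parakeet" would now fail in the provider factory. Hypothetical call sites (the audio file name is made up):

    text = transcribe_audio("meeting.wav")                        # default is now "whisper"
    text = transcribe_audio("meeting.wav", model_name="whisper")  # explicit, equivalent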
src/infrastructure/stt/parakeet_provider.py
DELETED
@@ -1,168 +0,0 @@
-"""Parakeet STT provider implementation using Hugging Face Transformers."""
-
-import logging
-import torch
-import librosa
-from pathlib import Path
-from typing import TYPE_CHECKING, Optional, Tuple
-
-if TYPE_CHECKING:
-    from ...domain.models.audio_content import AudioContent
-    from ...domain.models.text_content import TextContent
-
-from ..base.stt_provider_base import STTProviderBase
-from ...domain.exceptions import SpeechRecognitionException
-
-logger = logging.getLogger(__name__)
-
-
-class ParakeetSTTProvider(STTProviderBase):
-    """Parakeet STT provider using Hugging Face Transformers CTC model."""
-
-    def __init__(self):
-        """Initialize the Parakeet STT provider."""
-        super().__init__(
-            provider_name="Parakeet",
-            supported_languages=["en"]  # Parakeet primarily supports English
-        )
-        self.model = None
-        self.processor = None
-        self.current_model_name = None
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    def _perform_transcription(self, audio_path: Path, model: str) -> str:
-        """
-        Perform transcription using Parakeet CTC model.
-
-        Args:
-            audio_path: Path to the preprocessed audio file
-            model: The Parakeet model to use
-
-        Returns:
-            str: The transcribed text
-        """
-        try:
-            # Load model if not already loaded or if different model requested
-            if self.model is None or self.current_model_name != model:
-                self._load_model(model)
-
-            logger.info(f"Starting Parakeet transcription with model {model}")
-
-            # Load and preprocess audio
-            audio_array, sample_rate = self._load_audio(audio_path)
-
-            # Process audio with the processor
-            inputs = self.processor(
-                audio_array,
-                sampling_rate=sample_rate,
-                return_tensors="pt"
-            )
-
-            inputs.to(self.device, dtype="auto")
-
-            # Decode the predictions
-            outputs = this.model.generate(**inputs)
-            transcription = self.processor.batch_decode(outputs)
-
-            logger.info("Parakeet transcription completed successfully")
-            return transcription
-
-        except Exception as e:
-            self._handle_provider_error(e, "transcription")
-
-    def _load_model(self, model_name: str):
-        """
-        Load the Parakeet model using Hugging Face Transformers.
-
-        Args:
-            model_name: Name of the model to load
-        """
-        try:
-            from transformers import AutoProcessor, AutoModelForCTC
-
-            logger.info(f"Loading Parakeet model: {model_name}")
-
-            # Map model names to actual model identifiers
-            model_mapping = {
-                "parakeet-ctc-0.6b": "nvidia/parakeet-ctc-0.6b",
-                "default": "nvidia/parakeet-ctc-0.6b"
-            }
-
-            actual_model_name = model_mapping.get(model_name, model_mapping["default"])
-
-            # Load processor and model
-            self.processor = AutoProcessor.from_pretrained(actual_model_name)
-            self.model = AutoModelForCTC.from_pretrained(actual_model_name, dtype="auto", device_map=self.device)
-            self.current_model_name = model_name
-            logger.info(f"Parakeet processor {processor}")
-            logger.info(f"Parakeet model {model}")
-
-            # Set model to evaluation mode
-            self.model.eval()
-
-            logger.info(f"Parakeet model {model_name} loaded successfully")
-
-        except ImportError as e:
-            raise SpeechRecognitionException(
-                "transformers library not available. Please install with: pip install transformers[audio]"
-            ) from e
-        except Exception as e:
-            raise SpeechRecognitionException(f"Failed to load Parakeet model {model_name}: {str(e)}") from e
-
-    def _load_audio(self, audio_path: Path) -> Tuple[torch.Tensor, int]:
-        """
-        Load audio file and return as tensor with sample rate.
-
-        Args:
-            audio_path: Path to the audio file
-
-        Returns:
-            Tuple[torch.Tensor, int]: Audio tensor and sample rate
-        """
-        try:
-            # Load audio using librosa
-            audio_array, sample_rate = librosa.load(str(audio_path), sr=None)
-
-            # Convert to torch tensor
-            audio_tensor = torch.from_numpy(audio_array).float()
-
-            return audio_tensor, sample_rate
-
-        except Exception as e:
-            raise SpeechRecognitionException(f"Failed to load audio file {audio_path}: {str(e)}") from e
-
-    def is_available(self) -> bool:
-        """
-        Check if the Parakeet provider is available.
-
-        Returns:
-            bool: True if transformers and required libraries are available, False otherwise
-        """
-        try:
-            from transformers import AutoProcessor, AutoModelForCTC
-            import torch
-            import librosa
-            return True
-        except ImportError:
-            logger.warning("Required libraries (transformers, torch, librosa) not available")
-            return False
-
-    def get_available_models(self) -> list[str]:
-        """
-        Get list of available Parakeet models.
-
-        Returns:
-            list[str]: List of available model names
-        """
-        return [
-            "parakeet-ctc-0.6b"
-        ]
-
-    def get_default_model(self) -> str:
-        """
-        Get the default model for this provider.
-
-        Returns:
-            str: Default model name
-        """
-        return "parakeet-ctc-0.6b"
src/infrastructure/stt/provider_factory.py
CHANGED
@@ -5,7 +5,6 @@ from typing import Dict, Type, Optional

 from ..base.stt_provider_base import STTProviderBase
 from .whisper_provider import WhisperSTTProvider
-from .parakeet_provider import ParakeetSTTProvider
 from ...domain.exceptions import SpeechRecognitionException

 logger = logging.getLogger(__name__)

@@ -15,11 +14,10 @@ class STTProviderFactory:
     """Factory for creating STT provider instances with availability checking and fallback logic."""

     _providers: Dict[str, Type[STTProviderBase]] = {
-        "whisper": WhisperSTTProvider,
-        "parakeet": ParakeetSTTProvider
+        "whisper": WhisperSTTProvider
     }

-    _fallback_order = ["whisper"
+    _fallback_order = ["whisper"]

     @classmethod
     def create_provider(cls, provider_name: str) -> STTProviderBase:

@@ -162,7 +160,7 @@ class ASRFactory:
     """Legacy ASRFactory for backward compatibility."""

     @staticmethod
-    def get_model(model_name: str = "
+    def get_model(model_name: str = "whisper") -> STTProviderBase:
        """
        Get STT provider by model name (legacy interface).

@@ -175,7 +173,6 @@ class ASRFactory:
        # Map legacy model names to provider names
        provider_mapping = {
            "whisper": "whisper",
-           "parakeet": "parakeet",
            "faster-whisper": "whisper"
        }

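A hedged sketch of how provider resolution looks after the cleanup. STTProviderFactory, create_provider, _fallback_order and SpeechRecognitionException appear in the diff; the import paths, the wrapper function, and the exception raised for unknown providers are assumptions.

    from src.infrastructure.stt.provider_factory import STTProviderFactory  # assumed package root
    from src.domain.exceptions import SpeechRecognitionException            # assumed import path

    def resolve_stt_provider(requested: str):
        try:
            return STTProviderFactory.create_provider(requested)  # only "whisper" is registered now
        except SpeechRecognitionException:
            # _fallback_order is just ["whisper"] after this commit
            return STTProviderFactory.create_provider("whisper")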