Spaces:
Sleeping
Sleeping
Michael Hu
committed on
Commit
·
3ad3808
1
Parent(s):
22eccbb
fix: remove hard-coded ASR model list and make ASR model optional
Browse files
- Remove the hard-coded list of ASR models from the configuration service and the Gradio interface, making the ASR model parameter optional.
- Update the processing request DTO and service layer to handle the optional ASR model parameter.
- Update the processing pipeline to use a default ASR model when none is specified.
app.py
CHANGED
|
@@ -119,7 +119,6 @@ def get_supported_configurations() -> dict:
|
|
| 119 |
logger.warning("Using fallback configurations - this may indicate a configuration service issue")
|
| 120 |
# Return fallback configurations
|
| 121 |
fallback_config = {
|
| 122 |
-
'asr_models': ['parakeet', 'whisper-large'],
|
| 123 |
'voices': ['chatterbox'],
|
| 124 |
'languages': ['en', 'zh'],
|
| 125 |
'audio_formats': ['wav', 'mp3', 'm4a', 'flac', 'ogg'], # Updated to include all supported formats
|
|
@@ -131,7 +130,6 @@ def get_supported_configurations() -> dict:
|
|
| 131 |
|
| 132 |
def process_audio_pipeline(
|
| 133 |
audio_file,
|
| 134 |
-
asr_model: str,
|
| 135 |
target_language: str,
|
| 136 |
voice: str,
|
| 137 |
speed: float,
|
|
@@ -170,7 +168,7 @@ def process_audio_pipeline(
|
|
| 170 |
# Create processing request
|
| 171 |
request = ProcessingRequestDto(
|
| 172 |
audio=audio_upload,
|
| 173 |
-
asr_model=asr_model,
|
| 174 |
target_language=target_language,
|
| 175 |
voice=voice,
|
| 176 |
speed=speed,
|
|
@@ -225,7 +223,6 @@ def create_interface():
|
|
| 225 |
|
| 226 |
# Log configuration details for debugging
|
| 227 |
logger.info("=== Gradio Interface Configuration ===")
|
| 228 |
-
logger.info(f"Supported ASR models: {config.get('asr_models', [])}")
|
| 229 |
logger.info(f"Supported voices: {config.get('voices', [])}")
|
| 230 |
logger.info(f"Supported audio formats: {config.get('audio_formats', [])}")
|
| 231 |
logger.info(f"Max file size: {config.get('max_file_size_mb', 0)} MB")
|
|
@@ -238,14 +235,16 @@ def create_interface():
|
|
| 238 |
"English": "en"
|
| 239 |
}
|
| 240 |
|
| 241 |
-
def process_wrapper(audio_file,
|
| 242 |
"""Wrapper function for processing"""
|
| 243 |
# Map display language to code
|
| 244 |
target_lang_code = language_options.get(target_lang_val, "zh")
|
| 245 |
|
|
|
|
|
|
|
|
|
|
| 246 |
return process_audio_pipeline(
|
| 247 |
audio_file=audio_file,
|
| 248 |
-
asr_model=asr_model_val,
|
| 249 |
target_language=target_lang_code,
|
| 250 |
voice=voice_val,
|
| 251 |
speed=speed_val,
|
|
@@ -265,11 +264,6 @@ def create_interface():
|
|
| 265 |
# Accept both file extensions and MIME types
|
| 266 |
# This explicitly allows mp3 files to pass Gradio's frontend validation
|
| 267 |
),
|
| 268 |
-
gr.Dropdown(
|
| 269 |
-
choices=config['asr_models'],
|
| 270 |
-
value=config['asr_models'][0] if config['asr_models'] else "parakeet",
|
| 271 |
-
label="Speech Recognition Model"
|
| 272 |
-
),
|
| 273 |
gr.Dropdown(
|
| 274 |
choices=list(language_options.keys()),
|
| 275 |
value="Chinese (Mandarin)",
|
|
|
|
| 119 |
logger.warning("Using fallback configurations - this may indicate a configuration service issue")
|
| 120 |
# Return fallback configurations
|
| 121 |
fallback_config = {
|
|
|
|
| 122 |
'voices': ['chatterbox'],
|
| 123 |
'languages': ['en', 'zh'],
|
| 124 |
'audio_formats': ['wav', 'mp3', 'm4a', 'flac', 'ogg'], # Updated to include all supported formats
|
|
|
|
| 130 |
|
| 131 |
def process_audio_pipeline(
|
| 132 |
audio_file,
|
|
|
|
| 133 |
target_language: str,
|
| 134 |
voice: str,
|
| 135 |
speed: float,
|
|
|
|
| 168 |
# Create processing request
|
| 169 |
request = ProcessingRequestDto(
|
| 170 |
audio=audio_upload,
|
| 171 |
+
asr_model=asr_model, # This will use the default from config if None
|
| 172 |
target_language=target_language,
|
| 173 |
voice=voice,
|
| 174 |
speed=speed,
|
|
|
|
| 223 |
|
| 224 |
# Log configuration details for debugging
|
| 225 |
logger.info("=== Gradio Interface Configuration ===")
|
|
|
|
| 226 |
logger.info(f"Supported voices: {config.get('voices', [])}")
|
| 227 |
logger.info(f"Supported audio formats: {config.get('audio_formats', [])}")
|
| 228 |
logger.info(f"Max file size: {config.get('max_file_size_mb', 0)} MB")
|
|
|
|
| 235 |
"English": "en"
|
| 236 |
}
|
| 237 |
|
| 238 |
+
def process_wrapper(audio_file, target_lang_val, voice_val, speed_val):
|
| 239 |
"""Wrapper function for processing"""
|
| 240 |
# Map display language to code
|
| 241 |
target_lang_code = language_options.get(target_lang_val, "zh")
|
| 242 |
|
| 243 |
+
# Get default ASR model from configuration
|
| 244 |
+
default_asr_model = config.get('default_asr_model', 'whisper')
|
| 245 |
+
|
| 246 |
return process_audio_pipeline(
|
| 247 |
audio_file=audio_file,
|
|
|
|
| 248 |
target_language=target_lang_code,
|
| 249 |
voice=voice_val,
|
| 250 |
speed=speed_val,
|
|
|
|
| 264 |
# Accept both file extensions and MIME types
|
| 265 |
# This explicitly allows mp3 files to pass Gradio's frontend validation
|
| 266 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
gr.Dropdown(
|
| 268 |
choices=list(language_options.keys()),
|
| 269 |
value="Chinese (Mandarin)",
|
src/application/dtos/processing_request_dto.py
CHANGED
|
@@ -13,9 +13,9 @@ class ProcessingRequestDto:
|
|
| 13 |
the STT -> Translation -> TTS pipeline.
|
| 14 |
"""
|
| 15 |
audio: AudioUploadDto
|
| 16 |
-
asr_model: str
|
| 17 |
target_language: str
|
| 18 |
voice: str
|
|
|
|
| 19 |
speed: float = 1.0
|
| 20 |
source_language: Optional[str] = None
|
| 21 |
additional_params: Optional[Dict[str, Any]] = None
|
|
@@ -31,13 +31,12 @@ class ProcessingRequestDto:
|
|
| 31 |
if not isinstance(self.audio, AudioUploadDto):
|
| 32 |
raise ValueError("Audio must be an AudioUploadDto instance")
|
| 33 |
|
| 34 |
-
if
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
raise ValueError(f"Unsupported ASR model: {self.asr_model}. Supported: {supported_asr_models}")
|
| 41 |
|
| 42 |
if not self.target_language:
|
| 43 |
raise ValueError("Target language cannot be empty")
|
|
|
|
| 13 |
the STT -> Translation -> TTS pipeline.
|
| 14 |
"""
|
| 15 |
audio: AudioUploadDto
|
|
|
|
| 16 |
target_language: str
|
| 17 |
voice: str
|
| 18 |
+
asr_model: Optional[str] = None
|
| 19 |
speed: float = 1.0
|
| 20 |
source_language: Optional[str] = None
|
| 21 |
additional_params: Optional[Dict[str, Any]] = None
|
|
|
|
| 31 |
if not isinstance(self.audio, AudioUploadDto):
|
| 32 |
raise ValueError("Audio must be an AudioUploadDto instance")
|
| 33 |
|
| 34 |
+
# Validate ASR model if provided
|
| 35 |
+
if self.asr_model:
|
| 36 |
+
# Validate ASR model options
|
| 37 |
+
supported_asr_models = ['whisper-small', 'whisper-medium', 'whisper-large']
|
| 38 |
+
if self.asr_model not in supported_asr_models:
|
| 39 |
+
raise ValueError(f"Unsupported ASR model: {self.asr_model}. Supported: {supported_asr_models}")
|
|
|
|
| 40 |
|
| 41 |
if not self.target_language:
|
| 42 |
raise ValueError("Target language cannot be empty")
|
src/application/services/audio_processing_service.py
CHANGED
|
@@ -102,11 +102,14 @@ class AudioProcessingApplicationService:
|
|
| 102 |
Returns:
|
| 103 |
ProcessingResultDto: Result of the complete processing pipeline
|
| 104 |
"""
|
|
|
|
|
|
|
|
|
|
| 105 |
# Generate correlation ID and start operation logging
|
| 106 |
correlation_id = logger.log_operation_start(
|
| 107 |
"audio_processing_pipeline",
|
| 108 |
extra={
|
| 109 |
-
'asr_model':
|
| 110 |
'target_language': request.target_language,
|
| 111 |
'voice': request.voice,
|
| 112 |
'file_name': request.audio.filename,
|
|
@@ -133,7 +136,7 @@ class AudioProcessingApplicationService:
|
|
| 133 |
# Step 2: Speech-to-Text with retry and fallback
|
| 134 |
original_text = self._perform_speech_recognition_with_recovery(
|
| 135 |
audio_content,
|
| 136 |
-
|
| 137 |
correlation_id
|
| 138 |
)
|
| 139 |
|
|
@@ -168,7 +171,7 @@ class AudioProcessingApplicationService:
|
|
| 168 |
processing_time=processing_time,
|
| 169 |
metadata={
|
| 170 |
'correlation_id': correlation_id,
|
| 171 |
-
'asr_model':
|
| 172 |
'target_language': request.target_language,
|
| 173 |
'voice': request.voice,
|
| 174 |
'speed': request.speed,
|
|
@@ -634,7 +637,6 @@ class AudioProcessingApplicationService:
|
|
| 634 |
Dict[str, Any]: Supported configurations
|
| 635 |
"""
|
| 636 |
return {
|
| 637 |
-
'asr_models': ['whisper-large'],
|
| 638 |
'voices': ['chatterbox'],
|
| 639 |
'languages': ['en', 'zh'],
|
| 640 |
'audio_formats': self._config.get_processing_config()['supported_audio_formats'],
|
|
|
|
| 102 |
Returns:
|
| 103 |
ProcessingResultDto: Result of the complete processing pipeline
|
| 104 |
"""
|
| 105 |
+
# Use default ASR model from configuration if none provided
|
| 106 |
+
asr_model = request.asr_model or self._config.get_stt_config()['default_model']
|
| 107 |
+
|
| 108 |
# Generate correlation ID and start operation logging
|
| 109 |
correlation_id = logger.log_operation_start(
|
| 110 |
"audio_processing_pipeline",
|
| 111 |
extra={
|
| 112 |
+
'asr_model': asr_model,
|
| 113 |
'target_language': request.target_language,
|
| 114 |
'voice': request.voice,
|
| 115 |
'file_name': request.audio.filename,
|
|
|
|
| 136 |
# Step 2: Speech-to-Text with retry and fallback
|
| 137 |
original_text = self._perform_speech_recognition_with_recovery(
|
| 138 |
audio_content,
|
| 139 |
+
asr_model,
|
| 140 |
correlation_id
|
| 141 |
)
|
| 142 |
|
|
|
|
| 171 |
processing_time=processing_time,
|
| 172 |
metadata={
|
| 173 |
'correlation_id': correlation_id,
|
| 174 |
+
'asr_model': asr_model,
|
| 175 |
'target_language': request.target_language,
|
| 176 |
'voice': request.voice,
|
| 177 |
'speed': request.speed,
|
|
|
|
| 637 |
Dict[str, Any]: Supported configurations
|
| 638 |
"""
|
| 639 |
return {
|
|
|
|
| 640 |
'voices': ['chatterbox'],
|
| 641 |
'languages': ['en', 'zh'],
|
| 642 |
'audio_formats': self._config.get_processing_config()['supported_audio_formats'],
|