Michael Hu committed on
Commit 3ad3808 · 1 Parent(s): 22eccbb

fix: remove hard-coded ASR model list and make ASR model optional

- Remove the hard-coded list of ASR models from the configuration service and the Gradio interface, making the ASR model parameter optional.
- Update the processing request DTO and the service layer to handle the optional ASR model parameter.
- Update the processing pipeline to fall back to a default ASR model when none is specified (see the sketch after these notes).
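
Taken together, the DTO now treats the ASR model as optional and validates it only when one is supplied. A minimal sketch of that contract, using a hypothetical free-standing helper (the model names are the ones listed in the DTO diff below):

from typing import Optional

# Hypothetical helper mirroring the commit's "validate only if provided" rule;
# the supported-model list is copied from the DTO diff below.
SUPPORTED_ASR_MODELS = ['whisper-small', 'whisper-medium', 'whisper-large']

def validate_asr_model(asr_model: Optional[str]) -> None:
    if asr_model and asr_model not in SUPPORTED_ASR_MODELS:
        raise ValueError(
            f"Unsupported ASR model: {asr_model}. Supported: {SUPPORTED_ASR_MODELS}"
        )

validate_asr_model(None)             # OK: omitted, the pipeline default applies
validate_asr_model('whisper-large')  # OK: explicitly chosen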

app.py CHANGED
@@ -119,7 +119,6 @@ def get_supported_configurations() -> dict:
         logger.warning("Using fallback configurations - this may indicate a configuration service issue")
         # Return fallback configurations
         fallback_config = {
-            'asr_models': ['parakeet', 'whisper-large'],
             'voices': ['chatterbox'],
             'languages': ['en', 'zh'],
             'audio_formats': ['wav', 'mp3', 'm4a', 'flac', 'ogg'],  # Updated to include all supported formats
@@ -131,7 +130,6 @@ def get_supported_configurations() -> dict:
 
 def process_audio_pipeline(
     audio_file,
-    asr_model: str,
     target_language: str,
     voice: str,
     speed: float,
@@ -170,7 +168,7 @@ def process_audio_pipeline(
     # Create processing request
     request = ProcessingRequestDto(
         audio=audio_upload,
-        asr_model=asr_model,
+        asr_model=asr_model,  # This will use the default from config if None
         target_language=target_language,
         voice=voice,
         speed=speed,
@@ -225,7 +223,6 @@ def create_interface():
 
     # Log configuration details for debugging
     logger.info("=== Gradio Interface Configuration ===")
-    logger.info(f"Supported ASR models: {config.get('asr_models', [])}")
     logger.info(f"Supported voices: {config.get('voices', [])}")
     logger.info(f"Supported audio formats: {config.get('audio_formats', [])}")
     logger.info(f"Max file size: {config.get('max_file_size_mb', 0)} MB")
@@ -238,14 +235,16 @@ def create_interface():
         "English": "en"
     }
 
-    def process_wrapper(audio_file, asr_model_val, target_lang_val, voice_val, speed_val):
+    def process_wrapper(audio_file, target_lang_val, voice_val, speed_val):
         """Wrapper function for processing"""
         # Map display language to code
         target_lang_code = language_options.get(target_lang_val, "zh")
 
+        # Get default ASR model from configuration
+        default_asr_model = config.get('default_asr_model', 'whisper')
+
        return process_audio_pipeline(
             audio_file=audio_file,
-            asr_model=asr_model_val,
             target_language=target_lang_code,
             voice=voice_val,
             speed=speed_val,
@@ -265,11 +264,6 @@ def create_interface():
             # Accept both file extensions and MIME types
             # This explicitly allows mp3 files to pass Gradio's frontend validation
         ),
-        gr.Dropdown(
-            choices=config['asr_models'],
-            value=config['asr_models'][0] if config['asr_models'] else "parakeet",
-            label="Speech Recognition Model"
-        ),
         gr.Dropdown(
             choices=list(language_options.keys()),
             value="Chinese (Mandarin)",
src/application/dtos/processing_request_dto.py CHANGED
@@ -13,9 +13,9 @@ class ProcessingRequestDto:
     the STT -> Translation -> TTS pipeline.
     """
     audio: AudioUploadDto
-    asr_model: str
     target_language: str
     voice: str
+    asr_model: Optional[str] = None
     speed: float = 1.0
     source_language: Optional[str] = None
     additional_params: Optional[Dict[str, Any]] = None
@@ -31,13 +31,12 @@ class ProcessingRequestDto:
         if not isinstance(self.audio, AudioUploadDto):
             raise ValueError("Audio must be an AudioUploadDto instance")
 
-        if not self.asr_model:
-            raise ValueError("ASR model cannot be empty")
-
-        # Validate ASR model options
-        supported_asr_models = ['whisper-small', 'whisper-medium', 'whisper-large']
-        if self.asr_model not in supported_asr_models:
-            raise ValueError(f"Unsupported ASR model: {self.asr_model}. Supported: {supported_asr_models}")
+        # Validate ASR model if provided
+        if self.asr_model:
+            # Validate ASR model options
+            supported_asr_models = ['whisper-small', 'whisper-medium', 'whisper-large']
+            if self.asr_model not in supported_asr_models:
+                raise ValueError(f"Unsupported ASR model: {self.asr_model}. Supported: {supported_asr_models}")
 
         if not self.target_language:
             raise ValueError("Target language cannot be empty")
src/application/services/audio_processing_service.py CHANGED
@@ -102,11 +102,14 @@ class AudioProcessingApplicationService:
         Returns:
             ProcessingResultDto: Result of the complete processing pipeline
         """
+        # Use default ASR model from configuration if none provided
+        asr_model = request.asr_model or self._config.get_stt_config()['default_model']
+
         # Generate correlation ID and start operation logging
         correlation_id = logger.log_operation_start(
             "audio_processing_pipeline",
             extra={
-                'asr_model': request.asr_model,
+                'asr_model': asr_model,
                 'target_language': request.target_language,
                 'voice': request.voice,
                 'file_name': request.audio.filename,
@@ -133,7 +136,7 @@ class AudioProcessingApplicationService:
         # Step 2: Speech-to-Text with retry and fallback
         original_text = self._perform_speech_recognition_with_recovery(
             audio_content,
-            request.asr_model,
+            asr_model,
             correlation_id
         )
 
@@ -168,7 +171,7 @@ class AudioProcessingApplicationService:
             processing_time=processing_time,
             metadata={
                 'correlation_id': correlation_id,
-                'asr_model': request.asr_model,
+                'asr_model': asr_model,
                 'target_language': request.target_language,
                 'voice': request.voice,
                 'speed': request.speed,
@@ -634,7 +637,6 @@ class AudioProcessingApplicationService:
             Dict[str, Any]: Supported configurations
         """
         return {
-            'asr_models': ['whisper-large'],
             'voices': ['chatterbox'],
             'languages': ['en', 'zh'],
             'audio_formats': self._config.get_processing_config()['supported_audio_formats'],
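
The service resolves the model once, up front, with a plain `or` expression, so both None and an empty string fall back to the configured default. A sketch under the assumption that get_stt_config() returns a dict with a 'default_model' key, as the added line implies (the stub config class and its default value are hypothetical):

# Stub standing in for the real configuration service; get_stt_config()
# and its 'default_model' key come from the diff, the value is assumed.
class StubConfig:
    def get_stt_config(self) -> dict:
        return {'default_model': 'whisper-large'}

_config = StubConfig()

def resolve_asr_model(requested):
    # `or` treats None and '' alike, so any falsy request gets the default
    return requested or _config.get_stt_config()['default_model']

assert resolve_asr_model(None) == 'whisper-large'
assert resolve_asr_model('') == 'whisper-large'
assert resolve_asr_model('whisper-small') == 'whisper-small'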