Spaces:
Sleeping
Sleeping
Michael Hu
committed on
Commit
·
3ad3808
1
Parent(s):
22eccbb
fix: remove hard-coded ASR model list and make ASR model optional
Browse files
- Remove the hard-coded list of ASR models from the configuration service and the Gradio interface, making the ASR model parameter optional.
- Update the processing request DTO and service layer to handle the optional ASR model parameter.
- Update the processing pipeline to use a default ASR model when none is specified.
app.py
CHANGED
|
@@ -119,7 +119,6 @@ def get_supported_configurations() -> dict:
|
|
| 119 |
logger.warning("Using fallback configurations - this may indicate a configuration service issue")
|
| 120 |
# Return fallback configurations
|
| 121 |
fallback_config = {
|
| 122 |
-
'asr_models': ['parakeet', 'whisper-large'],
|
| 123 |
'voices': ['chatterbox'],
|
| 124 |
'languages': ['en', 'zh'],
|
| 125 |
'audio_formats': ['wav', 'mp3', 'm4a', 'flac', 'ogg'], # Updated to include all supported formats
|
|
@@ -131,7 +130,6 @@ def get_supported_configurations() -> dict:
|
|
| 131 |
|
| 132 |
def process_audio_pipeline(
|
| 133 |
audio_file,
|
| 134 |
-
asr_model: str,
|
| 135 |
target_language: str,
|
| 136 |
voice: str,
|
| 137 |
speed: float,
|
|
@@ -170,7 +168,7 @@ def process_audio_pipeline(
|
|
| 170 |
# Create processing request
|
| 171 |
request = ProcessingRequestDto(
|
| 172 |
audio=audio_upload,
|
| 173 |
-
asr_model=asr_model,
|
| 174 |
target_language=target_language,
|
| 175 |
voice=voice,
|
| 176 |
speed=speed,
|
|
@@ -225,7 +223,6 @@ def create_interface():
|
|
| 225 |
|
| 226 |
# Log configuration details for debugging
|
| 227 |
logger.info("=== Gradio Interface Configuration ===")
|
| 228 |
-
logger.info(f"Supported ASR models: {config.get('asr_models', [])}")
|
| 229 |
logger.info(f"Supported voices: {config.get('voices', [])}")
|
| 230 |
logger.info(f"Supported audio formats: {config.get('audio_formats', [])}")
|
| 231 |
logger.info(f"Max file size: {config.get('max_file_size_mb', 0)} MB")
|
|
@@ -238,14 +235,16 @@ def create_interface():
|
|
| 238 |
"English": "en"
|
| 239 |
}
|
| 240 |
|
| 241 |
-
def process_wrapper(audio_file,
|
| 242 |
"""Wrapper function for processing"""
|
| 243 |
# Map display language to code
|
| 244 |
target_lang_code = language_options.get(target_lang_val, "zh")
|
| 245 |
|
|
|
|
|
|
|
|
|
|
| 246 |
return process_audio_pipeline(
|
| 247 |
audio_file=audio_file,
|
| 248 |
-
asr_model=asr_model_val,
|
| 249 |
target_language=target_lang_code,
|
| 250 |
voice=voice_val,
|
| 251 |
speed=speed_val,
|
|
@@ -265,11 +264,6 @@ def create_interface():
|
|
| 265 |
# Accept both file extensions and MIME types
|
| 266 |
# This explicitly allows mp3 files to pass Gradio's frontend validation
|
| 267 |
),
|
| 268 |
-
gr.Dropdown(
|
| 269 |
-
choices=config['asr_models'],
|
| 270 |
-
value=config['asr_models'][0] if config['asr_models'] else "parakeet",
|
| 271 |
-
label="Speech Recognition Model"
|
| 272 |
-
),
|
| 273 |
gr.Dropdown(
|
| 274 |
choices=list(language_options.keys()),
|
| 275 |
value="Chinese (Mandarin)",
|
|
|
|
| 119 |
logger.warning("Using fallback configurations - this may indicate a configuration service issue")
|
| 120 |
# Return fallback configurations
|
| 121 |
fallback_config = {
|
|
|
|
| 122 |
'voices': ['chatterbox'],
|
| 123 |
'languages': ['en', 'zh'],
|
| 124 |
'audio_formats': ['wav', 'mp3', 'm4a', 'flac', 'ogg'], # Updated to include all supported formats
|
|
|
|
| 130 |
|
| 131 |
def process_audio_pipeline(
|
| 132 |
audio_file,
|
|
|
|
| 133 |
target_language: str,
|
| 134 |
voice: str,
|
| 135 |
speed: float,
|
|
|
|
| 168 |
# Create processing request
|
| 169 |
request = ProcessingRequestDto(
|
| 170 |
audio=audio_upload,
|
| 171 |
+
asr_model=asr_model, # This will use the default from config if None
|
| 172 |
target_language=target_language,
|
| 173 |
voice=voice,
|
| 174 |
speed=speed,
|
|
|
|
| 223 |
|
| 224 |
# Log configuration details for debugging
|
| 225 |
logger.info("=== Gradio Interface Configuration ===")
|
|
|
|
| 226 |
logger.info(f"Supported voices: {config.get('voices', [])}")
|
| 227 |
logger.info(f"Supported audio formats: {config.get('audio_formats', [])}")
|
| 228 |
logger.info(f"Max file size: {config.get('max_file_size_mb', 0)} MB")
|
|
|
|
| 235 |
"English": "en"
|
| 236 |
}
|
| 237 |
|
| 238 |
+
def process_wrapper(audio_file, target_lang_val, voice_val, speed_val):
|
| 239 |
"""Wrapper function for processing"""
|
| 240 |
# Map display language to code
|
| 241 |
target_lang_code = language_options.get(target_lang_val, "zh")
|
| 242 |
|
| 243 |
+
# Get default ASR model from configuration
|
| 244 |
+
default_asr_model = config.get('default_asr_model', 'whisper')
|
| 245 |
+
|
| 246 |
return process_audio_pipeline(
|
| 247 |
audio_file=audio_file,
|
|
|
|
| 248 |
target_language=target_lang_code,
|
| 249 |
voice=voice_val,
|
| 250 |
speed=speed_val,
|
|
|
|
| 264 |
# Accept both file extensions and MIME types
|
| 265 |
# This explicitly allows mp3 files to pass Gradio's frontend validation
|
| 266 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
gr.Dropdown(
|
| 268 |
choices=list(language_options.keys()),
|
| 269 |
value="Chinese (Mandarin)",
|
src/application/dtos/processing_request_dto.py
CHANGED
|
@@ -13,9 +13,9 @@ class ProcessingRequestDto:
|
|
| 13 |
the STT -> Translation -> TTS pipeline.
|
| 14 |
"""
|
| 15 |
audio: AudioUploadDto
|
| 16 |
-
asr_model: str
|
| 17 |
target_language: str
|
| 18 |
voice: str
|
|
|
|
| 19 |
speed: float = 1.0
|
| 20 |
source_language: Optional[str] = None
|
| 21 |
additional_params: Optional[Dict[str, Any]] = None
|
|
@@ -31,13 +31,12 @@ class ProcessingRequestDto:
|
|
| 31 |
if not isinstance(self.audio, AudioUploadDto):
|
| 32 |
raise ValueError("Audio must be an AudioUploadDto instance")
|
| 33 |
|
| 34 |
-
if
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
raise ValueError(f"Unsupported ASR model: {self.asr_model}. Supported: {supported_asr_models}")
|
| 41 |
|
| 42 |
if not self.target_language:
|
| 43 |
raise ValueError("Target language cannot be empty")
|
|
|
|
| 13 |
the STT -> Translation -> TTS pipeline.
|
| 14 |
"""
|
| 15 |
audio: AudioUploadDto
|
|
|
|
| 16 |
target_language: str
|
| 17 |
voice: str
|
| 18 |
+
asr_model: Optional[str] = None
|
| 19 |
speed: float = 1.0
|
| 20 |
source_language: Optional[str] = None
|
| 21 |
additional_params: Optional[Dict[str, Any]] = None
|
|
|
|
| 31 |
if not isinstance(self.audio, AudioUploadDto):
|
| 32 |
raise ValueError("Audio must be an AudioUploadDto instance")
|
| 33 |
|
| 34 |
+
# Validate ASR model if provided
|
| 35 |
+
if self.asr_model:
|
| 36 |
+
# Validate ASR model options
|
| 37 |
+
supported_asr_models = ['whisper-small', 'whisper-medium', 'whisper-large']
|
| 38 |
+
if self.asr_model not in supported_asr_models:
|
| 39 |
+
raise ValueError(f"Unsupported ASR model: {self.asr_model}. Supported: {supported_asr_models}")
|
|
|
|
| 40 |
|
| 41 |
if not self.target_language:
|
| 42 |
raise ValueError("Target language cannot be empty")
|
src/application/services/audio_processing_service.py
CHANGED
|
@@ -102,11 +102,14 @@ class AudioProcessingApplicationService:
|
|
| 102 |
Returns:
|
| 103 |
ProcessingResultDto: Result of the complete processing pipeline
|
| 104 |
"""
|
|
|
|
|
|
|
|
|
|
| 105 |
# Generate correlation ID and start operation logging
|
| 106 |
correlation_id = logger.log_operation_start(
|
| 107 |
"audio_processing_pipeline",
|
| 108 |
extra={
|
| 109 |
-
'asr_model':
|
| 110 |
'target_language': request.target_language,
|
| 111 |
'voice': request.voice,
|
| 112 |
'file_name': request.audio.filename,
|
|
@@ -133,7 +136,7 @@ class AudioProcessingApplicationService:
|
|
| 133 |
# Step 2: Speech-to-Text with retry and fallback
|
| 134 |
original_text = self._perform_speech_recognition_with_recovery(
|
| 135 |
audio_content,
|
| 136 |
-
|
| 137 |
correlation_id
|
| 138 |
)
|
| 139 |
|
|
@@ -168,7 +171,7 @@ class AudioProcessingApplicationService:
|
|
| 168 |
processing_time=processing_time,
|
| 169 |
metadata={
|
| 170 |
'correlation_id': correlation_id,
|
| 171 |
-
'asr_model':
|
| 172 |
'target_language': request.target_language,
|
| 173 |
'voice': request.voice,
|
| 174 |
'speed': request.speed,
|
|
@@ -634,7 +637,6 @@ class AudioProcessingApplicationService:
|
|
| 634 |
Dict[str, Any]: Supported configurations
|
| 635 |
"""
|
| 636 |
return {
|
| 637 |
-
'asr_models': ['whisper-large'],
|
| 638 |
'voices': ['chatterbox'],
|
| 639 |
'languages': ['en', 'zh'],
|
| 640 |
'audio_formats': self._config.get_processing_config()['supported_audio_formats'],
|
|
|
|
| 102 |
Returns:
|
| 103 |
ProcessingResultDto: Result of the complete processing pipeline
|
| 104 |
"""
|
| 105 |
+
# Use default ASR model from configuration if none provided
|
| 106 |
+
asr_model = request.asr_model or self._config.get_stt_config()['default_model']
|
| 107 |
+
|
| 108 |
# Generate correlation ID and start operation logging
|
| 109 |
correlation_id = logger.log_operation_start(
|
| 110 |
"audio_processing_pipeline",
|
| 111 |
extra={
|
| 112 |
+
'asr_model': asr_model,
|
| 113 |
'target_language': request.target_language,
|
| 114 |
'voice': request.voice,
|
| 115 |
'file_name': request.audio.filename,
|
|
|
|
| 136 |
# Step 2: Speech-to-Text with retry and fallback
|
| 137 |
original_text = self._perform_speech_recognition_with_recovery(
|
| 138 |
audio_content,
|
| 139 |
+
asr_model,
|
| 140 |
correlation_id
|
| 141 |
)
|
| 142 |
|
|
|
|
| 171 |
processing_time=processing_time,
|
| 172 |
metadata={
|
| 173 |
'correlation_id': correlation_id,
|
| 174 |
+
'asr_model': asr_model,
|
| 175 |
'target_language': request.target_language,
|
| 176 |
'voice': request.voice,
|
| 177 |
'speed': request.speed,
|
|
|
|
| 637 |
Dict[str, Any]: Supported configurations
|
| 638 |
"""
|
| 639 |
return {
|
|
|
|
| 640 |
'voices': ['chatterbox'],
|
| 641 |
'languages': ['en', 'zh'],
|
| 642 |
'audio_formats': self._config.get_processing_config()['supported_audio_formats'],
|