Spaces:
Sleeping
Sleeping
| """Concrete implementation of audio processing service.""" | |
| import time | |
| from typing import TYPE_CHECKING | |
| from ..interfaces.audio_processing import IAudioProcessingService | |
| from ..interfaces.speech_recognition import ISpeechRecognitionService | |
| from ..interfaces.translation import ITranslationService | |
| from ..interfaces.speech_synthesis import ISpeechSynthesisService | |
| from ..models.processing_result import ProcessingResult | |
| from ..models.translation_request import TranslationRequest | |
| from ..models.speech_synthesis_request import SpeechSynthesisRequest | |
| from ..exceptions import ( | |
| AudioProcessingException, | |
| SpeechRecognitionException, | |
| TranslationFailedException, | |
| SpeechSynthesisException | |
| ) | |
| if TYPE_CHECKING: | |
| from ..models.audio_content import AudioContent | |
| from ..models.voice_settings import VoiceSettings | |
class AudioProcessingService(IAudioProcessingService):
    """Concrete implementation of audio processing pipeline orchestration.

    Runs three stages in order -- speech recognition (STT), translation and
    speech synthesis (TTS) -- by delegating to the services injected at
    construction time, and reports the outcome as a ``ProcessingResult``.
    """

    # Longest input accepted by the pipeline (5 minutes limit). Compared
    # directly against ``audio.duration``; the validation error message
    # reports both values with an "s" suffix.
    MAX_AUDIO_DURATION_SECONDS = 300

    # Model identifier handed to the speech recognition service when the
    # caller does not choose one.
    DEFAULT_STT_MODEL = "whisper-base"

    def __init__(
        self,
        speech_recognition_service: ISpeechRecognitionService,
        translation_service: ITranslationService,
        speech_synthesis_service: ISpeechSynthesisService,
        *,
        stt_model: str = DEFAULT_STT_MODEL,
    ):
        """
        Initialize the audio processing service with injected dependencies.

        Args:
            speech_recognition_service: Service for speech-to-text conversion
            translation_service: Service for text translation
            speech_synthesis_service: Service for text-to-speech synthesis
            stt_model: Model identifier passed to the speech recognition
                service on every transcription (keyword-only; defaults to
                ``DEFAULT_STT_MODEL``)
        """
        self._speech_recognition_service = speech_recognition_service
        self._translation_service = translation_service
        self._speech_synthesis_service = speech_synthesis_service
        self._stt_model = stt_model

    def process_audio_pipeline(
        self,
        audio: 'AudioContent',
        target_language: str,
        voice_settings: 'VoiceSettings'
    ) -> 'ProcessingResult':
        """
        Process audio through the complete pipeline: STT -> Translation -> TTS.

        Failures are not propagated to the caller: every exception raised
        while validating or running a stage is converted into a failure
        ``ProcessingResult`` carrying the error message and the elapsed time.

        Args:
            audio: The input audio content
            target_language: The target language for translation
            voice_settings: Voice settings for TTS synthesis

        Returns:
            ProcessingResult: A success result with the original text, the
            translated text, the synthesized audio and the processing time,
            or a failure result with an error message and the processing time
        """
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        try:
            # Validate inputs
            self._validate_pipeline_inputs(audio, target_language, voice_settings)

            # Step 1: Speech Recognition (STT)
            original_text = self._perform_speech_recognition(audio)

            # Step 2: Translation
            translated_text = self._perform_translation(original_text, target_language)

            # Step 3: Speech Synthesis (TTS)
            audio_output = self._perform_speech_synthesis(translated_text, voice_settings)

            return ProcessingResult.success_result(
                original_text=original_text,
                translated_text=translated_text,
                audio_output=audio_output,
                processing_time=time.perf_counter() - start_time
            )
        except (
            AudioProcessingException,
            SpeechRecognitionException,
            TranslationFailedException,
            SpeechSynthesisException
        ) as e:
            # Expected domain failures, including input validation errors:
            # report their message as-is rather than labelling them
            # "unexpected".
            error_message = str(e)
        except Exception as e:
            # Anything else is a genuine surprise; say so in the message.
            error_message = f"Unexpected error in audio processing pipeline: {e}"

        return ProcessingResult.failure_result(
            error_message=error_message,
            processing_time=time.perf_counter() - start_time
        )

    def _validate_pipeline_inputs(
        self,
        audio: 'AudioContent',
        target_language: str,
        voice_settings: 'VoiceSettings'
    ) -> None:
        """
        Validate inputs for the audio processing pipeline.

        Checks run in a fixed order and the first failing check raises.

        Args:
            audio: The input audio content
            target_language: The target language for translation
            voice_settings: Voice settings for TTS synthesis

        Raises:
            AudioProcessingException: If any input is missing, the voice
                language differs from the target language, the audio is
                longer than ``MAX_AUDIO_DURATION_SECONDS``, or the audio
                reports an invalid format
        """
        if audio is None:
            raise AudioProcessingException("Audio content cannot be None")

        if not target_language or not target_language.strip():
            raise AudioProcessingException("Target language cannot be empty")

        if voice_settings is None:
            raise AudioProcessingException("Voice settings cannot be None")

        # Validate that voice settings language matches target language
        if voice_settings.language != target_language:
            raise AudioProcessingException(
                f"Voice settings language ({voice_settings.language}) must match "
                f"target language ({target_language})"
            )

        # Validate audio duration for processing limits
        if audio.duration > self.MAX_AUDIO_DURATION_SECONDS:
            raise AudioProcessingException(
                f"Audio duration ({audio.duration:.1f}s) exceeds maximum allowed "
                f"duration ({self.MAX_AUDIO_DURATION_SECONDS}s)"
            )

        # Validate audio format is supported
        if not audio.is_valid_format:
            raise AudioProcessingException(f"Unsupported audio format: {audio.format}")

    def _perform_speech_recognition(self, audio: 'AudioContent') -> 'TextContent':
        """
        Perform speech recognition on the input audio.

        Args:
            audio: The input audio content

        Returns:
            TextContent: The transcribed text

        Raises:
            SpeechRecognitionException: If transcription fails; the original
                exception is attached as ``__cause__``
        """
        try:
            return self._speech_recognition_service.transcribe(audio, self._stt_model)
        except Exception as e:
            raise SpeechRecognitionException(f"Speech recognition failed: {e}") from e

    def _perform_translation(self, text: 'TextContent', target_language: str) -> 'TextContent':
        """
        Perform translation of the transcribed text.

        Args:
            text: The text to translate
            target_language: The target language for translation

        Returns:
            TextContent: The translated text, or ``text`` itself when its
            language already equals ``target_language``

        Raises:
            TranslationFailedException: If translation fails; the original
                exception is attached as ``__cause__``
        """
        try:
            # No translation needed when the text is already in the target
            # language; skip the service call and return the original text.
            if text.language == target_language:
                return text

            translation_request = TranslationRequest(
                source_text=text,
                target_language=target_language
            )
            return self._translation_service.translate(translation_request)
        except Exception as e:
            raise TranslationFailedException(f"Translation failed: {e}") from e

    def _perform_speech_synthesis(
        self,
        text: 'TextContent',
        voice_settings: 'VoiceSettings'
    ) -> 'AudioContent':
        """
        Perform speech synthesis on the translated text.

        Args:
            text: The text to synthesize
            voice_settings: Voice settings for synthesis

        Returns:
            AudioContent: The synthesized audio

        Raises:
            SpeechSynthesisException: If synthesis fails; the original
                exception is attached as ``__cause__``
        """
        try:
            synthesis_request = SpeechSynthesisRequest(
                text_content=text,
                voice_settings=voice_settings
            )
            return self._speech_synthesis_service.synthesize(synthesis_request)
        except Exception as e:
            raise SpeechSynthesisException(f"Speech synthesis failed: {e}") from e