""" Automatic Speech Recognition (ASR) Engine for Carsa AI A comprehensive ASR engine that converts speech audio to text using state-of-the-art speech recognition models. Optimized for English speech recognition with support for various audio formats. Features: - High-quality speech-to-text conversion - Support for WAV, MP3, and other audio formats - Automatic audio preprocessing - GPU acceleration when available - Robust error handling Author: Carsa AI Team Version: 1.0.0 """ import torch import logging import io import tempfile import os import soundfile as sf from transformers import pipeline import librosa import numpy as np # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class ASREngine: """ A production-ready Automatic Speech Recognition engine. This class provides speech-to-text capabilities using Hugging Face's transformers library with Whisper or similar ASR models. """ def __init__(self, model_name="openai/whisper-small"): """ Initialize the ASR Engine. Args: model_name (str): The ASR model to use. Default: "openai/whisper-base" Options: "openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium" Raises: Exception: If model loading fails """ try: self.device = 0 if torch.cuda.is_available() else -1 device_name = "GPU" if torch.cuda.is_available() else "CPU" logger.info(f"ASR Engine using device: {device_name}") self.model_name = model_name self.sample_rate = 16000 # Whisper expects 16kHz audio logger.info(f"Loading ASR model: {model_name}") # Load the ASR pipeline self.transcriber = pipeline( "automatic-speech-recognition", model=model_name, device=self.device, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, return_timestamps=False # Set to True if you want word-level timestamps ) logger.info("✅ ASR Engine initialized successfully!") except Exception as e: logger.error(f"❌ Failed to initialize ASR Engine: {e}") raise Exception(f"ASR Engine initialization failed: {str(e)}") def _preprocess_audio(self, audio_bytes): """ Preprocess audio data for speech recognition. Args: audio_bytes (bytes): Raw audio data Returns: np.ndarray: Preprocessed audio array Raises: Exception: If audio preprocessing fails """ try: # First try using BytesIO (faster method) try: audio_file = io.BytesIO(audio_bytes) audio_data, sr = sf.read(audio_file) # Convert to mono if stereo if len(audio_data.shape) > 1: audio_data = np.mean(audio_data, axis=1) # Resample if needed if sr != self.sample_rate: import librosa audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=self.sample_rate) # Normalize and enhance audio if len(audio_data) > 0: # Remove DC offset audio_data = audio_data - np.mean(audio_data) # Apply normalization max_val = np.max(np.abs(audio_data)) if max_val > 0: audio_data = audio_data / max_val # Apply gentle noise gate (remove very quiet sections) noise_floor = 0.01 # 1% threshold audio_data = np.where(np.abs(audio_data) < noise_floor, 0, audio_data) logger.info(f"Audio preprocessed (BytesIO): {len(audio_data)} samples at {self.sample_rate}Hz") return audio_data except Exception as e1: logger.warning(f"BytesIO method failed: {e1}, trying temporary file method...") # Fallback to temporary file method with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file: temp_file.write(audio_bytes) temp_path = temp_file.name try: # Load audio using librosa import librosa audio_array, sr = librosa.load(temp_path, sr=self.sample_rate, mono=True) # Normalize audio if len(audio_array) > 0: max_val = np.max(np.abs(audio_array)) if max_val > 0: audio_array = audio_array / max_val logger.info(f"Audio preprocessed (file): {len(audio_array)} samples at {sr}Hz") return audio_array finally: # Clean up temporary file if os.path.exists(temp_path): os.unlink(temp_path) except Exception as e: logger.error(f"❌ Audio preprocessing failed: {e}") raise Exception(f"Failed to preprocess audio: {str(e)}") def transcribe(self, audio_bytes): """ Transcribe audio bytes to text. Args: audio_bytes (bytes): Audio data in bytes format Returns: str: Transcribed text Raises: ValueError: If audio data is invalid RuntimeError: If transcription fails """ if not audio_bytes: raise ValueError("Audio data cannot be empty") try: logger.info("Starting speech transcription...") # Preprocess audio audio_array = self._preprocess_audio(audio_bytes) if len(audio_array) == 0: logger.warning("Empty audio array after preprocessing") return "" # Perform transcription with compatible settings result = self.transcriber(audio_array) # Extract text from result if isinstance(result, dict): transcribed_text = result.get('text', '').strip() elif isinstance(result, str): transcribed_text = result.strip() else: transcribed_text = str(result).strip() # Clean up common transcription artifacts transcribed_text = self._clean_transcription(transcribed_text) logger.info(f"Transcription completed: '{transcribed_text[:100]}{'...' if len(transcribed_text) > 100 else ''}'") return transcribed_text except Exception as e: logger.error(f"Transcription failed: {e}") raise RuntimeError(f"Speech transcription failed: {str(e)}") def _clean_transcription(self, text): """ Clean up common transcription artifacts and repetitive patterns. Args: text (str): Raw transcription text Returns: str: Cleaned transcription text """ if not text: return "" import re # Remove excessive repetition (more than 3 consecutive identical words/chars) # Pattern: word-word-word-word... -> word text = re.sub(r'\b(\w+)(?:-\1){3,}\b', r'\1', text) # Remove excessive repetition of single characters # Pattern: I-I-I-I... -> I text = re.sub(r'\b(\w)(?:-\1){2,}\b', r'\1', text) # Remove excessive repetition of words # Pattern: yeah yeah yeah yeah... -> yeah text = re.sub(r'\b(\w+)(?:\s+\1){3,}\b', r'\1', text, flags=re.IGNORECASE) # Clean up extra spaces and punctuation text = re.sub(r'\s+', ' ', text) text = text.strip() # If result is too short or just repetitive noise, return empty if len(text) < 3 or len(set(text.lower().split())) == 1: return "" return text def transcribe_file(self, file_path): """ Transcribe audio from a file. Args: file_path (str): Path to the audio file Returns: str: Transcribed text Raises: FileNotFoundError: If file doesn't exist RuntimeError: If transcription fails """ if not os.path.exists(file_path): raise FileNotFoundError(f"Audio file not found: {file_path}") try: with open(file_path, 'rb') as f: audio_bytes = f.read() return self.transcribe(audio_bytes) except Exception as e: logger.error(f"File transcription failed: {e}") raise RuntimeError(f"File transcription failed: {str(e)}") def get_supported_formats(self): """ Get list of supported audio formats. Returns: list: List of supported audio file extensions """ return ['.wav', '.mp3', '.m4a', '.flac', '.ogg', '.aac'] def get_engine_info(self): """ Get information about the ASR engine. Returns: dict: Engine information including model and device details """ return { "engine": "ASR Engine", "version": "1.0.0", "model": self.model_name, "device": "GPU" if torch.cuda.is_available() else "CPU", "sample_rate": self.sample_rate, "supported_formats": self.get_supported_formats(), "framework": "transformers + whisper" } def health_check(self): """ Perform a health check on the ASR engine. Returns: dict: Health status information """ try: # Test with a simple sine wave test_audio = np.sin(2 * np.pi * 440 * np.linspace(0, 1, self.sample_rate)) test_audio = (test_audio * 32767).astype(np.int16) # Convert to bytes test_bytes = test_audio.tobytes() # Try transcription (should return empty or noise) self.transcriber(test_audio.astype(np.float32)) return { "status": "healthy", "message": "ASR engine is functioning correctly", "model_loaded": True } except Exception as e: logger.error(f"Health check failed: {e}") return { "status": "unhealthy", "message": f"ASR engine health check failed: {str(e)}", "model_loaded": hasattr(self, 'transcriber') and self.transcriber is not None } def main(): """Example usage and testing of the ASR Engine.""" try: # Initialize the engine logger.info("Testing ASR Engine...") engine = ASREngine() # Print engine info info = engine.get_engine_info() logger.info(f"Engine Info: {info}") # Perform health check health = engine.health_check() logger.info(f"Health Check: {health}") # Test with a simple audio file if available test_files = ["test_audio.wav", "sample.wav", "test.wav"] for test_file in test_files: if os.path.exists(test_file): try: transcription = engine.transcribe_file(test_file) logger.info(f"🎯 Transcription: {transcription}") break except Exception as e: logger.error(f"Failed to transcribe {test_file}: {e}") else: logger.info("No test audio files found. Engine is ready for use.") logger.info("🎉 ASR Engine testing completed!") except Exception as e: logger.error(f"❌ ASR Engine test failed: {e}") if __name__ == "__main__": main()