"""
voice_interface.py — Voice I/O for the Computer Agent
======================================================
Speech-to-Text (Whisper / Faster-Whisper) and TTS (HF Inference API)
"""
|
|
| import os |
| import io |
| import tempfile |
| import base64 |
| from typing import Optional, Dict, Any |
|
|
| import numpy as np |
|
|
| |
# Optional dependency: faster-whisper provides the local speech-to-text
# backend. The flag lets the class defer the hard failure until STT is
# actually requested (see VoiceInterface._load_stt).
try:
    from faster_whisper import WhisperModel
    HAS_FASTER_WHISPER = True
except ImportError:
    HAS_FASTER_WHISPER = False

# Optional dependency: huggingface_hub's InferenceClient is used for
# text-to-speech via the HF Inference API (see VoiceInterface._load_tts).
try:
    from huggingface_hub import InferenceClient
    HAS_HF_INFERENCE = True
except ImportError:
    HAS_HF_INFERENCE = False
|
|
|
|
class VoiceInterface:
    """Handles audio input (STT) and output (TTS) for the agent.

    Speech-to-text runs locally via faster-whisper; text-to-speech goes
    through the Hugging Face Inference API.  Both backends are created
    lazily on first use, so constructing this class is cheap and does not
    require either optional dependency to be installed.
    """

    def __init__(
        self,
        stt_model_size: str = "base",
        tts_model: str = "hexgrad/Kokoro-82M",
        hf_token: Optional[str] = None,
    ):
        """
        Args:
            stt_model_size: Whisper model size ("tiny", "base", "small", ...).
            tts_model: Hugging Face model id used for speech synthesis.
            hf_token: HF API token; falls back to the HF_TOKEN env var.
        """
        self.stt_model_size = stt_model_size
        self.tts_model = tts_model
        self.hf_token = hf_token or os.getenv("HF_TOKEN")
        self._stt: Optional[Any] = None          # lazily created WhisperModel
        self._tts_client: Optional[Any] = None   # lazily created InferenceClient

    # ------------------------------------------------------------- STT ----

    def _load_stt(self) -> Any:
        """Create (once) and return the local Whisper model.

        Raises:
            RuntimeError: if faster-whisper is not installed.
        """
        if self._stt is None:
            if not HAS_FASTER_WHISPER:
                raise RuntimeError("faster-whisper not installed. Run: pip install faster-whisper")
            # int8 on CPU keeps memory usage low with minimal accuracy loss.
            self._stt = WhisperModel(self.stt_model_size, device="cpu", compute_type="int8")
        return self._stt

    @staticmethod
    def _collect_transcription(model: Any, path: str) -> Dict[str, Any]:
        """Run Whisper on an audio file and fold its output into a dict.

        faster-whisper returns segments as a lazy generator; joining the
        texts consumes it fully before the caller may delete the file.
        """
        segments, info = model.transcribe(path, beam_size=5)
        text = " ".join(seg.text for seg in segments)
        return {
            "text": text.strip(),
            "language": info.language,
            "probability": info.language_probability,
        }

    def transcribe(self, audio_np: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Transcribe an in-memory waveform to text.

        Args:
            audio_np: float32 mono samples in [-1, 1].
            sample_rate: sample rate of ``audio_np`` in Hz.

        Returns:
            Dict with keys "text", "language", "probability".
        """
        model = self._load_stt()
        import soundfile as sf  # optional dependency; only needed here

        # Close the handle before writing: writing through a second handle
        # while the NamedTemporaryFile is still open fails on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_path = f.name
        try:
            sf.write(tmp_path, audio_np, sample_rate)
            return self._collect_transcription(model, tmp_path)
        finally:
            # Always remove the temp file, even if transcription raises.
            os.unlink(tmp_path)

    def transcribe_from_file(self, file_path: str) -> Dict[str, Any]:
        """Transcribe an audio file on disk; same return shape as transcribe()."""
        model = self._load_stt()
        return self._collect_transcription(model, file_path)

    # ------------------------------------------------------------- TTS ----

    def _load_tts(self) -> Any:
        """Create (once) and return the HF inference client for TTS.

        Raises:
            RuntimeError: if huggingface_hub is not installed.
        """
        if self._tts_client is None:
            if not HAS_HF_INFERENCE:
                raise RuntimeError("huggingface_hub not installed")
            self._tts_client = InferenceClient(model=self.tts_model, token=self.hf_token)
        return self._tts_client

    @staticmethod
    def _as_bytes(audio: Any) -> bytes:
        """Normalize an inference response (raw bytes or file-like) to bytes."""
        if hasattr(audio, "read"):
            return audio.read()
        return audio

    def synthesize(self, text: str, voice: str = "af") -> bytes:
        """Synthesize text to speech bytes.

        Args:
            text: text to speak.
            voice: reserved for voice selection; currently unused by the
                HF text-to-speech endpoint call below.

        Returns:
            Raw audio bytes (usually WAV or MP3 depending on the model).
        """
        client = self._load_tts()
        try:
            return self._as_bytes(client.text_to_speech(text, model=self.tts_model))
        except Exception:
            # Primary model failed (cold start, quota, unsupported task):
            # best-effort fallback to a widely available VITS voice.
            alt_client = InferenceClient(token=self.hf_token)
            audio = alt_client.text_to_speech(text, model="espnet/kan-bayashi_ljspeech_vits")
            return self._as_bytes(audio)

    def synthesize_to_file(self, text: str, output_path: str, voice: str = "af") -> str:
        """Synthesize ``text`` and write the audio bytes to ``output_path``."""
        audio_bytes = self.synthesize(text, voice)
        with open(output_path, "wb") as f:
            f.write(audio_bytes)
        return output_path

    # ---------------------------------------------------------- Gradio ----

    def process_gradio_audio(self, audio_tuple) -> str:
        """Transcribe Gradio audio input.

        Args:
            audio_tuple: ``(sample_rate, numpy_array)`` as produced by
                ``gr.Audio``, or None when nothing was recorded.

        Returns:
            The transcribed text, or "" for empty input.
        """
        if audio_tuple is None:
            return ""
        sample_rate, audio_np = audio_tuple
        # Down-mix multi-channel audio to mono.
        if audio_np.ndim > 1:
            audio_np = audio_np.mean(axis=1)
        if np.issubdtype(audio_np.dtype, np.integer):
            # Gradio delivers integer PCM (typically int16); scale to
            # [-1, 1] floats as Whisper / the WAV round-trip expect.
            audio_np = audio_np.astype(np.float32) / np.iinfo(audio_np.dtype).max
        elif audio_np.dtype != np.float32:
            audio_np = audio_np.astype(np.float32)
        result = self.transcribe(audio_np, sample_rate)
        return result["text"]
|
|