"""
voice_interface.py — Voice I/O for the Computer Agent
======================================================
Speech-to-Text (Whisper / Faster-Whisper) and TTS (HF Inference API)
"""
|
|
| import os |
| import io |
| import tempfile |
| import base64 |
| from typing import Optional, Dict, Any |
|
|
| import numpy as np |
|
|
| |
# Optional dependency: faster-whisper provides the local speech-to-text
# backend. The flag lets the class defer the hard failure until STT is
# actually requested (see VoiceInterface._load_stt).
try:
    from faster_whisper import WhisperModel
    HAS_FASTER_WHISPER = True
except ImportError:
    HAS_FASTER_WHISPER = False

# Optional dependency: huggingface_hub's InferenceClient is used for
# text-to-speech via the HF Inference API (see VoiceInterface._load_tts).
try:
    from huggingface_hub import InferenceClient
    HAS_HF_INFERENCE = True
except ImportError:
    HAS_HF_INFERENCE = False
|
|
|
|
class VoiceInterface:
    """Handles audio input (STT) and output (TTS) for the agent.

    Speech-to-text runs locally via faster-whisper; text-to-speech goes
    through the Hugging Face Inference API.  Both backends are created
    lazily on first use, so constructing this class is cheap and does not
    require either optional dependency to be installed.
    """

    def __init__(
        self,
        stt_model_size: str = "base",
        tts_model: str = "hexgrad/Kokoro-82M",
        hf_token: Optional[str] = None,
    ):
        """
        Args:
            stt_model_size: Whisper model size ("tiny", "base", "small", ...).
            tts_model: Hugging Face model id used for speech synthesis.
            hf_token: HF API token; falls back to the HF_TOKEN env var.
        """
        self.stt_model_size = stt_model_size
        self.tts_model = tts_model
        self.hf_token = hf_token or os.getenv("HF_TOKEN")
        self._stt: Optional[Any] = None          # lazily created WhisperModel
        self._tts_client: Optional[Any] = None   # lazily created InferenceClient

    # ------------------------------------------------------------- STT ----

    def _load_stt(self) -> Any:
        """Create (once) and return the local Whisper model.

        Raises:
            RuntimeError: if faster-whisper is not installed.
        """
        if self._stt is None:
            if not HAS_FASTER_WHISPER:
                raise RuntimeError("faster-whisper not installed. Run: pip install faster-whisper")
            # int8 on CPU keeps memory usage low with minimal accuracy loss.
            self._stt = WhisperModel(self.stt_model_size, device="cpu", compute_type="int8")
        return self._stt

    @staticmethod
    def _collect_transcription(model: Any, path: str) -> Dict[str, Any]:
        """Run Whisper on an audio file and fold its output into a dict.

        faster-whisper returns segments as a lazy generator; joining the
        texts consumes it fully before the caller may delete the file.
        """
        segments, info = model.transcribe(path, beam_size=5)
        text = " ".join(seg.text for seg in segments)
        return {
            "text": text.strip(),
            "language": info.language,
            "probability": info.language_probability,
        }

    def transcribe(self, audio_np: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Transcribe an in-memory waveform to text.

        Args:
            audio_np: float32 mono samples in [-1, 1].
            sample_rate: sample rate of ``audio_np`` in Hz.

        Returns:
            Dict with keys "text", "language", "probability".
        """
        model = self._load_stt()
        import soundfile as sf  # optional dependency; only needed here

        # Close the handle before writing: writing through a second handle
        # while the NamedTemporaryFile is still open fails on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_path = f.name
        try:
            sf.write(tmp_path, audio_np, sample_rate)
            return self._collect_transcription(model, tmp_path)
        finally:
            # Always remove the temp file, even if transcription raises.
            os.unlink(tmp_path)

    def transcribe_from_file(self, file_path: str) -> Dict[str, Any]:
        """Transcribe an audio file on disk; same return shape as transcribe()."""
        model = self._load_stt()
        return self._collect_transcription(model, file_path)

    # ------------------------------------------------------------- TTS ----

    def _load_tts(self) -> Any:
        """Create (once) and return the HF inference client for TTS.

        Raises:
            RuntimeError: if huggingface_hub is not installed.
        """
        if self._tts_client is None:
            if not HAS_HF_INFERENCE:
                raise RuntimeError("huggingface_hub not installed")
            self._tts_client = InferenceClient(model=self.tts_model, token=self.hf_token)
        return self._tts_client

    @staticmethod
    def _as_bytes(audio: Any) -> bytes:
        """Normalize an inference response (raw bytes or file-like) to bytes."""
        if hasattr(audio, "read"):
            return audio.read()
        return audio

    def synthesize(self, text: str, voice: str = "af") -> bytes:
        """Synthesize text to speech bytes.

        Args:
            text: text to speak.
            voice: reserved for voice selection; currently unused by the
                HF text-to-speech endpoint call below.

        Returns:
            Raw audio bytes (usually WAV or MP3 depending on the model).
        """
        client = self._load_tts()
        try:
            return self._as_bytes(client.text_to_speech(text, model=self.tts_model))
        except Exception:
            # Primary model failed (cold start, quota, unsupported task):
            # best-effort fallback to a widely available VITS voice.
            alt_client = InferenceClient(token=self.hf_token)
            audio = alt_client.text_to_speech(text, model="espnet/kan-bayashi_ljspeech_vits")
            return self._as_bytes(audio)

    def synthesize_to_file(self, text: str, output_path: str, voice: str = "af") -> str:
        """Synthesize ``text`` and write the audio bytes to ``output_path``."""
        audio_bytes = self.synthesize(text, voice)
        with open(output_path, "wb") as f:
            f.write(audio_bytes)
        return output_path

    # ---------------------------------------------------------- Gradio ----

    def process_gradio_audio(self, audio_tuple) -> str:
        """Transcribe Gradio audio input.

        Args:
            audio_tuple: ``(sample_rate, numpy_array)`` as produced by
                ``gr.Audio``, or None when nothing was recorded.

        Returns:
            The transcribed text, or "" for empty input.
        """
        if audio_tuple is None:
            return ""
        sample_rate, audio_np = audio_tuple
        # Down-mix multi-channel audio to mono.
        if audio_np.ndim > 1:
            audio_np = audio_np.mean(axis=1)
        if np.issubdtype(audio_np.dtype, np.integer):
            # Gradio delivers integer PCM (typically int16); scale to
            # [-1, 1] floats as Whisper / the WAV round-trip expect.
            audio_np = audio_np.astype(np.float32) / np.iinfo(audio_np.dtype).max
        elif audio_np.dtype != np.float32:
            audio_np = audio_np.astype(np.float32)
        result = self.transcribe(audio_np, sample_rate)
        return result["text"]
|
|