Spaces:

drvsbrkcn
/

EceMotion_Pictures

Paused

App Files Community

EceMotion_Pictures / utils_audio.py

drvsbrkcn

Upload 3 files

869d082 verified about 1 month ago

raw

history blame contribute delete

10.1 kB

	"""
	Audio processing utilities for EceMotion Pictures.
	Enhanced text-to-speech generation with robust error handling and fallbacks.
	"""

	import numpy as np
	import logging
	import os
	from typing import Tuple, Optional, Dict, Any

	from config import (
	MODEL_AUDIO, MODEL_CONFIGS, AUDIO_SAMPLE_RATE, get_device, get_safe_model_name
	)

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Global model cache.
	# _tts_pipe holds the most recently loaded TTS pipeline (None until first load);
	# _current_tts_model holds the model name that pipeline was loaded under.
	# Both are written by get_tts_pipe() and read by synth_voice().
	_tts_pipe = None
	_current_tts_model = None

	def get_tts_pipe(model_name: str = MODEL_AUDIO, device: str = None):
	    """Get or create the TTS pipeline with lazy loading and model switching.

	    The loaded pipeline is cached in the module globals ``_tts_pipe`` and
	    ``_current_tts_model``. A new load happens only when nothing is cached
	    or when a different (sanitized) model name is requested.

	    Args:
	        model_name: Requested model; passed through ``get_safe_model_name``.
	        device: Target device string; defaults to ``get_device()``.

	    Returns:
	        The loaded pipeline, or ``None`` when neither the requested model
	        nor the fallback model could be loaded.
	    """
	    global _tts_pipe, _current_tts_model

	    fallback_model = "parler-tts/parler-tts-mini-v1"

	    if device is None:
	        device = get_device()

	    # Use safe model name
	    safe_model_name = get_safe_model_name(model_name, "audio")

	    # Cache hit: same model already loaded.
	    if _tts_pipe is not None and _current_tts_model == safe_model_name:
	        return _tts_pipe

	    logger.info(f"Loading TTS model: {safe_model_name}")

	    pipe = None
	    try:
	        if "f5-tts" in safe_model_name.lower():
	            # Try F5-TTS first
	            pipe = _load_f5_tts(safe_model_name, device)
	        else:
	            # Use standard TTS pipeline
	            pipe = _load_standard_tts(safe_model_name, device)

	        if pipe is None:
	            raise RuntimeError("Failed to load any TTS model")
	    except Exception as e:
	        logger.error(f"Failed to load {safe_model_name}: {e}")
	        pipe = None

	    if pipe is not None:
	        _tts_pipe = pipe
	        _current_tts_model = safe_model_name
	        logger.info(f"TTS model {safe_model_name} loaded successfully")
	        return _tts_pipe

	    # Fallback to original model. Retrying is pointless when the model that
	    # just failed *is* the fallback.
	    if safe_model_name != fallback_model:
	        pipe = _load_standard_tts(fallback_model, device)

	    if pipe is not None:
	        _tts_pipe = pipe
	        _current_tts_model = fallback_model
	    else:
	        # Do not record a model name for a pipeline that does not exist;
	        # leaving both globals empty makes the next call retry the load.
	        logger.error("Fallback TTS model could not be loaded; no TTS pipeline available")
	        _tts_pipe = None
	        _current_tts_model = None

	    return _tts_pipe

	def _load_f5_tts(model_name: str, device: str):
	    """Build a text-to-speech pipeline for an F5-TTS model.

	    ``device_map`` is set to ``device`` only when it equals ``"cuda"``;
	    otherwise ``None`` is passed. Any exception during import or
	    construction is logged and ``None`` is returned instead.
	    """
	    try:
	        from transformers import pipeline

	        placement = device if device == "cuda" else None
	        loaded = pipeline(
	            "text-to-speech",
	            model=model_name,
	            torch_dtype="auto",
	            device_map=placement,
	        )
	    except Exception as e:
	        logger.error(f"Failed to load F5-TTS: {e}")
	        return None

	    return loaded

	def _load_standard_tts(model_name: str, device: str):
	    """Load a standard text-to-speech pipeline.

	    Args:
	        model_name: Model identifier handed to ``transformers.pipeline``.
	        device: ``"cuda"``, ``"cpu"`` or ``"auto"``; ``"auto"`` resolves to
	            CUDA when ``torch.cuda.is_available()`` and CPU otherwise.

	    Returns:
	        The constructed pipeline, or ``None`` if import or loading fails
	        (the error is logged).
	    """
	    try:
	        from transformers import pipeline
	        import torch

	        # Fix device string - convert "auto" to proper device
	        if device == "auto":
	            device = "cuda" if torch.cuda.is_available() else "cpu"

	        on_cuda = device == "cuda"
	        pipe_kwargs = {
	            "model": model_name,
	            "torch_dtype": torch.float16 if on_cuda else torch.float32,
	        }
	        if on_cuda:
	            # Place the model via the factory's ``device`` argument.
	            # The previous code called ``pipe.to(device)`` on the Pipeline
	            # object; that call raised and was swallowed, so CUDA loads
	            # always returned None.
	            pipe_kwargs["device"] = device

	        return pipeline("text-to-speech", **pipe_kwargs)

	    except Exception as e:
	        logger.error(f"Failed to load standard TTS: {e}")
	        return None

	def synth_voice(text: str, voice_prompt: str, sr: int = AUDIO_SAMPLE_RATE,
	                model_name: str = MODEL_AUDIO, device: str = None) -> Tuple[int, np.ndarray]:
	    """
	    Generate speech from text with enhanced TTS support.

	    Args:
	        text: Text to speak. Truncated to the model's ``max_text_length``
	            and right-padded with spaces up to ``min_text_length`` (both
	            read from ``MODEL_CONFIGS``, defaulting to 500 and 10).
	        voice_prompt: Passed to the pipeline as ``voice_preset``.
	        sr: Sample rate of the returned audio.
	        model_name: Requested TTS model.
	        device: Device string; defaults to ``get_device()``.

	    Returns:
	        ``(sr, wav)`` with ``wav`` as float32. If the pipeline is missing
	        or synthesis raises, the result of ``_create_fallback_audio`` is
	        returned instead.
	    """
	    if device is None:
	        device = get_device()

	    tts = get_tts_pipe(model_name, device)
	    # Read the cached name only after get_tts_pipe() has run: it may have
	    # switched to a fallback model, or left the name unset on failure.
	    active_model = _current_tts_model
	    model_config = MODEL_CONFIGS.get(active_model, {})

	    # Validate text length
	    max_length = model_config.get("max_text_length", 500)
	    min_length = model_config.get("min_text_length", 10)

	    if len(text) > max_length:
	        logger.warning(f"Text too long ({len(text)} chars), truncating to {max_length}")
	        text = text[:max_length]
	    elif len(text) < min_length:
	        logger.warning(f"Text too short ({len(text)} chars), padding")
	        text = text + " " * (min_length - len(text))

	    try:
	        if tts is None:
	            raise RuntimeError("no TTS pipeline available")

	        is_f5 = "f5-tts" in (active_model or "").lower()
	        if is_f5:
	            # F5-TTS specific generation
	            result = tts(
	                text=text,
	                voice_preset=voice_prompt,
	                return_tensors="pt"
	            )
	        else:
	            # Standard pipeline (Parler-TTS, etc.)
	            result = tts({"text": text, "voice_preset": voice_prompt})

	        wav = result["audio"]

	        # Prefer the rate the pipeline reports for its own output; fall back
	        # to the configured rate when it is not provided.
	        source_sr = AUDIO_SAMPLE_RATE
	        if isinstance(result, dict):
	            source_sr = result.get("sampling_rate") or AUDIO_SAMPLE_RATE

	        # Ensure proper format. ``detach`` is checked first because tensors
	        # expose both attributes, and ``.numpy()`` alone fails for tensors
	        # that require grad or live on an accelerator.
	        if hasattr(wav, 'detach'):
	            wav = wav.detach().cpu().numpy()
	        elif hasattr(wav, 'numpy'):
	            wav = wav.numpy()
	        wav = np.asarray(wav)

	        # F5 output is flattened as before; other pipelines may return
	        # (1, n_samples), so drop singleton axes to get a mono vector.
	        wav = wav.flatten() if is_f5 else np.squeeze(wav)

	        # Normalize audio
	        wav = normalize_audio(wav)

	        # Resample if needed
	        if sr != source_sr:
	            wav = _resample_audio(wav, source_sr, sr)

	        logger.info(f"Generated audio: {len(wav)/sr:.2f}s at {sr}Hz")
	        return sr, wav.astype(np.float32)

	    except Exception as e:
	        logger.error(f"Voice synthesis failed: {e}")
	        # Return fallback audio
	        return _create_fallback_audio(text, sr)

	def _resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
	    """Resample audio using available methods.

	    Uses ``librosa.resample`` when librosa can be imported, otherwise
	    linear interpolation with NumPy.

	    Args:
	        audio: 1-D sample array.
	        orig_sr: Sample rate of ``audio``.
	        target_sr: Desired sample rate.

	    Returns:
	        The resampled samples. ``audio`` is returned unchanged when the
	        rates are equal or the input is empty.
	    """
	    # Nothing to do; also keeps np.interp away from empty input, which it rejects.
	    if orig_sr == target_sr or len(audio) == 0:
	        return audio

	    try:
	        import librosa
	        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
	    except ImportError:
	        # Simple resampling without librosa
	        ratio = target_sr / orig_sr
	        new_length = int(len(audio) * ratio)
	        # Output sample k sits at time k / target_sr, i.e. at source index
	        # k / ratio. Positions past the last source sample are clamped to
	        # the final value by np.interp.
	        positions = np.arange(new_length) / ratio
	        return np.interp(positions, np.arange(len(audio)), audio)

	def _create_fallback_audio(text: str, sr: int) -> Tuple[int, np.ndarray]:
	    """Synthesize a placeholder tone for use when TTS fails.

	    The tone lasts ``len(text) / 20`` seconds (at least one second) and is
	    a 440 Hz sine at amplitude 0.1 plus a 660 Hz sine at amplitude 0.05.
	    If building the tone raises, two seconds of silence are returned.

	    Returns:
	        ``(sr, samples)`` with float32 samples.
	    """
	    try:
	        # Rough duration estimate from the text length
	        duration = max(1.0, len(text) / 20.0)
	        n_samples = int(sr * duration)
	        t = np.linspace(0, duration, n_samples, endpoint=False)

	        frequency = 440.0  # A4 note
	        omega = 2 * np.pi * frequency

	        # Base tone plus a quieter component at 1.5x the frequency
	        tone = 0.1 * np.sin(omega * t) + 0.05 * np.sin(omega * 1.5 * t)

	        logger.info(f"Created fallback audio: {duration:.2f}s")
	        return sr, tone.astype(np.float32)

	    except Exception as e:
	        logger.error(f"Failed to create fallback audio: {e}")
	        # Last resort: silence
	        silent = np.zeros(int(sr * 2.0))
	        return sr, silent.astype(np.float32)

	def normalize_audio(audio: np.ndarray, target_lufs: float = -23.0) -> np.ndarray:
	    """Peak-normalize audio to 0.95 and then apply compression.

	    Args:
	        audio: Sample array. Empty and all-zero input skip the scaling step.
	        target_lufs: Accepted for interface compatibility; not used by the
	            current implementation, which does peak normalization only.

	    Returns:
	        The result of ``apply_compression`` on the scaled samples.
	    """
	    audio = np.asarray(audio)

	    # np.max raises on an empty array, so treat empty input as having no peak.
	    peak = np.max(np.abs(audio)) if audio.size else 0.0

	    # Simple peak normalization first
	    if peak > 0:
	        audio = audio / peak * 0.95

	    # Apply gentle compression
	    return apply_compression(audio)

	def apply_compression(audio: np.ndarray, ratio: float = 3.0, threshold: float = 0.7) -> np.ndarray:
	    """Compress samples whose magnitude exceeds ``threshold``.

	    For those samples, the portion of the magnitude above ``threshold`` is
	    divided by ``ratio`` and the sign is kept. All other samples are
	    copied unchanged. The input array is not modified.
	    """
	    out = np.copy(audio)

	    loud_mask = np.abs(audio) > threshold
	    loud = audio[loud_mask]

	    # Keep the threshold part intact and shrink only the excess
	    excess = np.abs(loud) - threshold
	    out[loud_mask] = np.sign(loud) * (threshold + excess / ratio)

	    return out

	def retro_bed(duration_s: float, sr: int = AUDIO_SAMPLE_RATE, bpm: int = 92):
	    """Generate a background bed from four triangle-wave segments plus noise.

	    The timeline is split into four equal segments, each filled with a
	    triangle wave at one of four fixed frequencies (amplitude 0.15).
	    Gaussian noise (scale 0.01) is added and the result is low-pass
	    filtered: a 3rd-order Butterworth at 3000 Hz when scipy is importable,
	    otherwise a 5-tap moving average. ``bpm`` is accepted but not used in
	    the body.

	    Returns:
	        ``(sr, samples)`` with float32 samples; silence of the same length
	        if generation raises.
	    """
	    try:
	        n_samples = int(sr * duration_s)
	        t = np.linspace(0, duration_s, n_samples, endpoint=False)

	        # Root frequency (Hz) for each segment
	        freqs = [220.0, 174.61, 196.0, 146.83]
	        seg_len = int(len(t) / len(freqs))
	        sig = np.zeros_like(t)

	        for idx, f0 in enumerate(freqs):
	            span = slice(idx * seg_len, (idx + 1) * seg_len)
	            phase = (t[span] * f0) % 1
	            # Fold the sawtooth phase into a triangle wave in [-1, 1]
	            sig[span] = 0.15 * (2 * np.abs(2 * phase - 1) - 1)

	        # Add tape noise
	        bed = sig + 0.01 * np.random.randn(len(t))

	        # Apply gentle lowpass filter
	        try:
	            from scipy import signal
	            b, a = signal.butter(3, 3000, 'low', fs=sr)
	            bed = signal.lfilter(b, a, bed)
	        except ImportError:
	            # Simple averaging filter if scipy not available
	            kernel = np.ones(5)/5
	            bed = np.convolve(bed, kernel, mode='same')

	        return sr, bed.astype(np.float32)

	    except Exception as e:
	        logger.error(f"Failed to generate retro bed: {e}")
	        # Return silence
	        return sr, np.zeros(int(sr * duration_s)).astype(np.float32)

	def mix_to_stereo(sr1, a, sr2, b, bed_gain=0.5):
	    """Mix two signals into one stereo track.

	    Args:
	        sr1: Sample rate of ``a``.
	        a: Foreground samples, shape ``(n,)`` or ``(n, channels)``.
	        sr2: Sample rate of ``b``; must equal ``sr1``.
	        b: Background samples, same accepted shapes as ``a``.
	        bed_gain: Gain applied to ``b`` before mixing.

	    Returns:
	        ``(sr1, stereo)`` where ``stereo`` has shape ``(n, 2)`` and is
	        clipped to [-1, 1]. The shorter input is zero-padded at the end.
	        The right channel is the left channel's mix scaled by 0.9.

	    Raises:
	        AssertionError: If the sample rates differ.
	    """
	    assert sr1 == sr2, "Sample rates must match"

	    def to_mono(x):
	        # Multi-channel input is averaged across channels so both inputs
	        # are 1-D before mixing; otherwise mono + stereo cannot broadcast
	        # and stacking would yield a 3-D array instead of (n, 2).
	        x = np.asarray(x)
	        if x.ndim > 1:
	            x = x.reshape(len(x), -1).mean(axis=1)
	        return x

	    a = to_mono(a)
	    b = to_mono(b)

	    n = max(len(a), len(b))

	    def pad(x):
	        if len(x) < n:
	            x = np.concatenate([x, np.zeros(n - len(x))])
	        return x

	    a = pad(a)
	    b = pad(b)

	    left = a + bed_gain * b
	    right = a * 0.9 + bed_gain * 0.9 * b

	    stereo = np.stack([left, right], axis=1)

	    return sr1, np.clip(stereo, -1.0, 1.0)

	def write_wav(path: str, sr: int, wav: np.ndarray):
	    """Write audio to a WAV file.

	    Uses ``soundfile`` when it can be imported. Otherwise falls back to
	    ``scipy.io.wavfile`` and writes 16-bit PCM.

	    Args:
	        path: Destination file path.
	        sr: Sample rate in Hz.
	        wav: Samples, expected in the range [-1, 1].

	    Raises:
	        RuntimeError: If neither soundfile nor scipy can be imported.
	    """
	    try:
	        import soundfile as sf
	    except ImportError:
	        sf = None

	    if sf is not None:
	        sf.write(path, wav, sr)
	        return

	    # Fallback using scipy
	    try:
	        from scipy.io import wavfile
	    except ImportError:
	        logger.error("No audio writing library available (soundfile or scipy)")
	        raise RuntimeError("Cannot write audio file - no audio library available")

	    # Convert to 16-bit. Clip first: values outside [-1, 1] would otherwise
	    # wrap around in the int16 cast and turn peaks into loud glitches.
	    wav_16bit = (np.clip(wav, -1.0, 1.0) * 32767).astype(np.int16)
	    wavfile.write(path, sr, wav_16bit)