# Page-status text captured with this listing ("Spaces: Sleeping"); not part of the module.
"""
Enhanced voice modulation with open-source tools for the GRIT Voice Agent.

Provides emotion-based voice modulation and alternative TTS options.
"""
| import os | |
| import json | |
| import logging | |
| import numpy as np | |
| import tempfile | |
| import subprocess | |
| from typing import Dict, List, Optional, Union, Any | |
| from datetime import datetime | |
# Configure root logging at import time and create this module's logger.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Optional audio-processing stack. Both packages must import for the flag to be
# True; `librosa` and `sf` are only bound in that case.
try:
    import librosa
    import soundfile as sf
except ImportError:
    AUDIO_LIBS_AVAILABLE = False
    logger.warning("Audio libraries not available. Install with: pip install librosa soundfile")
else:
    AUDIO_LIBS_AVAILABLE = True

# Optional Bark TTS backend. SAMPLE_RATE, generate_audio and preload_models are
# only bound when the import succeeds.
try:
    from bark import SAMPLE_RATE, generate_audio, preload_models
except ImportError:
    BARK_AVAILABLE = False
    logger.warning("Bark TTS not available. Install with: pip install git+https://github.com/suno-ai/bark.git")
else:
    BARK_AVAILABLE = True
# Default file locations: where generated audio is written, and the reference
# voice sample handed to XTTS.
DEFAULT_OUTPUT = "output.wav"
DEFAULT_VOICE = "voices/315_taylor.wav"

# Per-emotion voice settings, keyed by emotion name.
#   speed        - time-stretch rate (1.0 leaves the tempo unchanged)
#   pitch_shift  - value passed to librosa's pitch_shift as n_steps (0.0 = none)
#   energy_boost - factor the samples are multiplied by (1.0 = unchanged)
EMOTION_VOICE_PARAMS = {
    "joy": {"speed": 1.15, "pitch_shift": 0.5, "energy_boost": 1.2},
    "sadness": {"speed": 0.85, "pitch_shift": -1.0, "energy_boost": 0.8},
    "anger": {"speed": 1.1, "pitch_shift": -0.5, "energy_boost": 1.5},
    "fear": {"speed": 1.05, "pitch_shift": 0.3, "energy_boost": 0.9},
    "surprise": {"speed": 1.2, "pitch_shift": 1.0, "energy_boost": 1.3},
    "neutral": {"speed": 1.0, "pitch_shift": 0.0, "energy_boost": 1.0},
}
class VoiceModulator:
    """Post-process audio files with per-emotion speed, pitch and gain changes."""

    def __init__(self):
        # Modulation needs librosa and soundfile; record whether they imported.
        self.available = AUDIO_LIBS_AVAILABLE

    def apply_modulation(self,
                         audio_path: str,
                         output_path: str,
                         emotion: str = "neutral") -> str:
        """
        Write an emotion-modulated copy of an audio file.

        The settings for ``emotion`` are looked up in EMOTION_VOICE_PARAMS;
        an unknown emotion uses the "neutral" entry.

        Args:
            audio_path: Path to the input audio file
            output_path: Path where the modulated audio is saved
            emotion: Emotion to apply (joy, sadness, anger, fear, surprise, neutral)

        Returns:
            ``output_path`` on success. ``audio_path`` (the untouched input)
            when the audio libraries are missing, the input file does not
            exist, or processing raises.
        """
        if not self.available:
            logger.error("Audio libraries not available")
            return audio_path

        if not os.path.exists(audio_path):
            logger.error(f"Audio file not found: {audio_path}")
            return audio_path

        try:
            settings = EMOTION_VOICE_PARAMS.get(emotion, EMOTION_VOICE_PARAMS["neutral"])

            # sr=None keeps the file's own sample rate
            samples, sample_rate = librosa.load(audio_path, sr=None)

            # Each stage is skipped when its setting is the identity value
            if settings["speed"] != 1.0:
                samples = librosa.effects.time_stretch(samples, rate=settings["speed"])

            if settings["pitch_shift"] != 0.0:
                samples = librosa.effects.pitch_shift(
                    samples, sr=sample_rate, n_steps=settings["pitch_shift"]
                )

            if settings["energy_boost"] != 1.0:
                samples = samples * settings["energy_boost"]
                # Scale back down if the boost pushed the peak above full scale
                peak = np.max(np.abs(samples))
                if peak > 1.0:
                    samples = samples / peak

            sf.write(output_path, samples, sample_rate)
            logger.info(f"Applied {emotion} modulation to {audio_path}")
            return output_path
        except Exception as e:
            logger.error(f"Error applying voice modulation: {e}")
            return audio_path
class BarkTTS:
    """Bark text-to-speech implementation"""

    # Phrase appended to the text for each emotion.
    # NOTE(review): Bark synthesizes the prompt text itself, so these phrases are
    # probably spoken aloud rather than acting as a style instruction - confirm.
    _EMOTION_PROMPTS = {
        "joy": "with an excited and happy tone",
        "sadness": "with a sad and melancholic tone",
        "anger": "with an angry and intense tone",
        "fear": "with a fearful and nervous tone",
        "surprise": "with a surprised and amazed tone",
        "neutral": "with a neutral and calm tone"
    }

    def __init__(self):
        # Usable only if the bark package imported and its models preload.
        self.available = BARK_AVAILABLE
        if self.available:
            try:
                preload_models()
                logger.info("Bark TTS models loaded")
            except Exception as e:
                logger.error(f"Failed to load Bark TTS models: {e}")
                self.available = False

    @staticmethod
    def _is_voice_preset(speaker_id: str) -> bool:
        """Return True when speaker_id looks like a Bark voice preset, not a text tag."""
        return (
            speaker_id == "announcer"
            or "/" in speaker_id
            or "_speaker_" in speaker_id
            or speaker_id.endswith(".npz")
        )

    def generate_speech(self,
                        text: str,
                        output_path: str = DEFAULT_OUTPUT,
                        speaker_id: Optional[str] = None,
                        emotion: str = "neutral") -> Optional[str]:
        """
        Generate speech using Bark TTS

        Args:
            text: Text to convert to speech
            output_path: Path to save audio file
            speaker_id: Either a Bark voice preset (e.g. "v2/en_speaker_6" or a
                path to an .npz file), which is passed to Bark as
                ``history_prompt``, or a short tag (e.g. "MAN"), which is
                prefixed to the text as ``[tag]``
            emotion: Emotion to apply; unknown emotions add nothing to the prompt

        Returns:
            Path to generated audio file, or None if Bark is unavailable, the
            audio cannot be saved, or generation fails
        """
        if not self.available:
            logger.error("Bark TTS not available")
            return None

        # The module-level `sf` is only bound when librosa AND soundfile both
        # imported, so import soundfile here and fail early with a clear message
        # instead of generating audio that cannot be saved.
        try:
            import soundfile
        except (ImportError, OSError) as e:
            logger.error(f"Error generating speech with Bark TTS: soundfile is not usable ({e})")
            return None

        try:
            history_prompt = None
            prompt_parts = []
            if speaker_id:
                if self._is_voice_preset(speaker_id):
                    # Presets select the voice through Bark's history_prompt
                    # argument; putting them in the text would not.
                    history_prompt = speaker_id
                else:
                    prompt_parts.append(f"[{speaker_id}]")
            prompt_parts.append(text)

            emotion_prompt = self._EMOTION_PROMPTS.get(emotion, "")
            if emotion_prompt:
                prompt_parts.append(emotion_prompt)

            # Joining only the non-empty parts avoids a trailing space
            prompt = " ".join(prompt_parts)

            audio_array = generate_audio(prompt, history_prompt=history_prompt)

            soundfile.write(output_path, audio_array, SAMPLE_RATE)
            logger.info(f"Generated speech with Bark TTS: {output_path}")
            return output_path
        except Exception as e:
            logger.error(f"Error generating speech with Bark TTS: {e}")
            return None
class PiperTTS:
    """Piper TTS implementation (command-line based)"""

    def __init__(self, model_dir: str = "piper_models"):
        # Directory searched for "<voice>.onnx" model files
        self.model_dir = model_dir
        self.available = self._check_piper()

    def _check_piper(self) -> bool:
        """Check if Piper is installed by running ``piper --help``."""
        try:
            result = subprocess.run(["piper", "--help"],
                                    stdin=subprocess.DEVNULL,
                                    stdout=subprocess.DEVNULL,
                                    stderr=subprocess.DEVNULL)
        except FileNotFoundError:
            # The executable is not on PATH
            logger.warning("Piper TTS command not found")
            return False
        except Exception as e:
            logger.error(f"Error checking Piper TTS: {e}")
            return False

        if result.returncode == 0:
            logger.info("Piper TTS is available")
            return True

        # The command exists but did not run cleanly
        logger.warning(f"Piper TTS check failed (exit code {result.returncode})")
        return False

    def _get_model_path(self, voice: str = "en_US-lessac-medium") -> Optional[str]:
        """Return the path to the voice's .onnx model, or None if it is missing."""
        model_path = os.path.join(self.model_dir, f"{voice}.onnx")
        if os.path.exists(model_path):
            return model_path
        logger.warning(f"Piper model not found: {model_path}")
        return None

    def _modulate_in_place(self, audio_path: str, emotion: str) -> None:
        """Best-effort emotion modulation of audio_path; the file is kept as-is on failure."""
        modulator = VoiceModulator()
        if not modulator.available:
            logger.warning(f"Audio libraries not available; keeping unmodulated audio: {audio_path}")
            return

        # Keep the scratch file next to the target so os.replace never has to
        # cross file systems and the working directory is left alone.
        scratch_path = os.path.join(
            os.path.dirname(audio_path),
            f"modulated_{os.path.basename(audio_path)}",
        )
        result = modulator.apply_modulation(audio_path, scratch_path, emotion)
        if result == scratch_path:
            os.replace(scratch_path, audio_path)
            return

        # apply_modulation returns the input path on failure; the scratch file
        # may be missing or partial, so never move it over the good audio.
        logger.warning(f"Emotion modulation failed; keeping unmodulated audio: {audio_path}")
        if os.path.exists(scratch_path):
            os.remove(scratch_path)

    def generate_speech(self,
                        text: str,
                        output_path: str = DEFAULT_OUTPUT,
                        voice: str = "en_US-lessac-medium",
                        emotion: str = "neutral") -> Optional[str]:
        """
        Generate speech using Piper TTS

        Args:
            text: Text to convert to speech
            output_path: Path to save audio file
            voice: Voice model to use
            emotion: Emotion to apply (used for post-processing)

        Returns:
            Path to generated audio file, or None if Piper is unavailable, the
            model is missing, or synthesis fails. A failure of the optional
            emotion post-processing does not discard the generated audio.
        """
        if not self.available:
            logger.error("Piper TTS not available")
            return None

        try:
            model_path = self._get_model_path(voice)
            if not model_path:
                return None

            # Piper reads the text to speak from standard input, so feed it
            # directly; no temporary file is needed (and none can leak).
            command = [
                "piper",
                "--model", model_path,
                "--output_file", output_path
            ]
            result = subprocess.run(command,
                                    input=text.encode("utf-8"),
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

            if result.returncode != 0:
                logger.error(f"Piper TTS error: {result.stderr.decode(errors='replace')}")
                return None

            logger.info(f"Generated speech with Piper TTS: {output_path}")

            if emotion != "neutral":
                self._modulate_in_place(output_path, emotion)

            return output_path
        except Exception as e:
            logger.error(f"Error generating speech with Piper TTS: {e}")
            return None
class EnhancedVoice:
    """Enhanced voice generation with emotion support"""

    def __init__(self):
        self.modulator = VoiceModulator()
        self.bark_tts = BarkTTS() if BARK_AVAILABLE else None
        self.piper_tts = PiperTTS()
        # XTTS model, created on first use and then reused across calls
        self._xtts_model = None

        # Check if XTTS is available
        try:
            from TTS.api import TTS  # noqa: F401 - imported only to probe availability
            self.xtts_available = True
        except ImportError:
            self.xtts_available = False
            logger.warning("XTTS not available")

    def _get_xtts_model(self):
        """Return the XTTS model, loading it on first use instead of on every call."""
        model = getattr(self, "_xtts_model", None)
        if model is None:
            from TTS.api import TTS
            model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
            self._xtts_model = model
        return model

    def _modulate_in_place(self, audio_path: str, emotion: str) -> None:
        """Best-effort emotion modulation of audio_path; the file is kept as-is on failure."""
        # Keep the scratch file next to the target so os.replace never has to
        # cross file systems and the working directory is left alone.
        scratch_path = os.path.join(
            os.path.dirname(audio_path),
            f"modulated_{os.path.basename(audio_path)}",
        )
        result = self.modulator.apply_modulation(audio_path, scratch_path, emotion)
        if result == scratch_path:
            os.replace(scratch_path, audio_path)
            return

        # apply_modulation returns the input path on failure; the scratch file
        # may be missing or partial, so never move it over the good audio.
        logger.warning(f"Emotion modulation failed; keeping unmodulated audio: {audio_path}")
        if os.path.exists(scratch_path):
            os.remove(scratch_path)

    def _generate_xtts(self,
                       text: str,
                       output_path: str,
                       voice_file: str,
                       emotion: str,
                       language: str) -> str:
        """Synthesize with XTTS, then apply the optional emotion post-processing."""
        params = EMOTION_VOICE_PARAMS.get(emotion, EMOTION_VOICE_PARAMS["neutral"])

        # NOTE(review): `speed` is passed to XTTS here and VoiceModulator
        # time-stretches by the same factor afterwards, so non-neutral emotions
        # may get the speed change applied twice - confirm this is intended.
        self._get_xtts_model().tts_to_file(
            text=text,
            file_path=output_path,
            speaker_wav=voice_file,
            language=language,
            speed=params["speed"]
        )

        if emotion != "neutral" and self.modulator.available:
            self._modulate_in_place(output_path, emotion)

        return output_path

    def generate_speech(self,
                        text: str,
                        output_path: str = DEFAULT_OUTPUT,
                        voice_file: str = DEFAULT_VOICE,
                        engine: str = "xtts",
                        emotion: str = "neutral",
                        language: str = "en") -> Optional[str]:
        """
        Generate speech with emotion

        Args:
            text: Text to convert to speech
            output_path: Path to save audio file
            voice_file: Path to reference voice file (for XTTS)
            engine: TTS engine to use (xtts, bark, piper)
            emotion: Emotion to apply
            language: Language code (used by XTTS)

        Returns:
            Path to generated audio file, or None when the requested engine is
            unknown or unavailable, or when generation fails
        """
        try:
            if engine == "bark" and self.bark_tts and self.bark_tts.available:
                return self.bark_tts.generate_speech(
                    text=text,
                    output_path=output_path,
                    emotion=emotion
                )

            if engine == "piper" and self.piper_tts and self.piper_tts.available:
                return self.piper_tts.generate_speech(
                    text=text,
                    output_path=output_path,
                    emotion=emotion
                )

            if engine == "xtts" and self.xtts_available:
                return self._generate_xtts(text, output_path, voice_file, emotion, language)

            logger.error(f"No available TTS engine for {engine}")
            return None
        except Exception as e:
            logger.error(f"Error generating enhanced speech: {e}")
            return None
# Module-wide EnhancedVoice used by generate_speech() below. It is built at
# import time, so the TTS back ends are probed and loaded when this module is
# imported.
enhanced_voice = EnhancedVoice()


def generate_speech(text: str,
                    output_path: str = DEFAULT_OUTPUT,
                    voice_file: str = DEFAULT_VOICE,
                    engine: str = "xtts",
                    emotion: str = "neutral",
                    language: str = "en") -> str:
    """
    Generate speech with emotion using available TTS engines

    Forwards every argument unchanged to the shared ``enhanced_voice``
    instance and returns its result: the path of the generated audio file,
    or None when no engine could produce it.
    """
    request = {
        "text": text,
        "output_path": output_path,
        "voice_file": voice_file,
        "engine": engine,
        "emotion": emotion,
        "language": language,
    }
    return enhanced_voice.generate_speech(**request)
def apply_voice_modulation(audio_path: str,
                           output_path: str,
                           emotion: str = "neutral") -> str:
    """
    Apply emotion-based modulation to existing audio file

    Creates a fresh VoiceModulator for the call and returns whatever its
    apply_modulation() returns.
    """
    return VoiceModulator().apply_modulation(
        audio_path=audio_path,
        output_path=output_path,
        emotion=emotion,
    )
# Example usage: synthesize one sample sentence per emotion with every engine
if __name__ == "__main__":
    sample_texts = {
        "joy": "I'm so excited to share this amazing news with you! We've achieved our goals!",
        "sadness": "Unfortunately, I have to inform you that we didn't meet our targets this quarter.",
        "anger": "This is completely unacceptable! We need to address this issue immediately!",
        "surprise": "Wow! I can't believe what just happened! This is incredible!",
        "neutral": "Let me provide you with an update on our current progress."
    }
    engines = ("xtts", "bark", "piper")

    for emotion, text in sample_texts.items():
        print(f"Testing {emotion}...")

        for engine in engines:
            # One output file per engine/emotion pair
            output_path = f"{engine}_{emotion}.wav"
            result = generate_speech(
                text=text,
                output_path=output_path,
                engine=engine,
                emotion=emotion
            )

            if not result:
                print(f"Failed to generate speech with {engine}")
                continue
            print(f"Generated speech with {engine}: {output_path}")