Spaces:

Chillarmo
/

Voice_Cloning_with_OuteTTS

Running

App Files Community

Chillarmo committed on Nov 5, 2024

Commit

776e91e

•

1 Parent(s): 287cc1c

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -30

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import gradio as gr
 import torch
-import torch.nn as nn
 import os
 from outetts.v0_1.interface import InterfaceHF
 import soundfile as sf
@@ -15,13 +14,10 @@ torch.set_grad_enabled(False)  # Disable gradient computation
 class OptimizedTTSInterface:
     def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
         self.interface = InterfaceHF(model_name)
-        # Quantize the model to INT8
-        self.interface.model = torch.quantization.quantize_dynamic(
-            self.interface.model, {nn.Linear}, dtype=torch.qint8
-        )
-        # Move model to CPU and enable inference mode
-        self.interface.model.cpu()
-        self.interface.model.eval()
     def create_speaker(self, *args, **kwargs):
         with torch.inference_mode():
@@ -33,19 +29,25 @@ class OptimizedTTSInterface:
 def initialize_models():
     """Initialize the OptimizedTTS and Faster-Whisper models"""
-    # Use cached models if available
     cache_dir = Path("model_cache")
     cache_dir.mkdir(exist_ok=True)
-    tts_interface = OptimizedTTSInterface()
-    # Initialize Whisper with maximum optimization
     asr_model = WhisperModel("tiny",
                             device="cpu",
                             compute_type="int8",
                             num_workers=1,
                             cpu_threads=2,
                             download_root=str(cache_dir))
     return tts_interface, asr_model
 def transcribe_audio(audio_path):
@@ -79,6 +81,9 @@ def preprocess_audio(audio_path):
         if len(data.shape) > 1:
             data = data.mean(axis=1)
         # Save preprocessed audio
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         sf.write(temp_file.name, data, sr)
@@ -99,19 +104,20 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
                 return None, reference_text
         # Create speaker from reference audio
-        speaker = TTS_INTERFACE.create_speaker(
-            processed_audio,
-            reference_text
-        )
-        # Generate speech with cloned voice
-        output = TTS_INTERFACE.generate(
-            text=text_to_speak,
-            speaker=speaker,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            max_lenght=4096
-        )
         # Clean up preprocessed audio if it was created
         if processed_audio != audio_path:
@@ -133,10 +139,10 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
                 pass
         return None, f"Error: {str(e)}"
-print("Initializing models...")
 # Initialize models globally
 TTS_INTERFACE, ASR_MODEL = initialize_models()
-print("Models initialized!")
 # Create Gradio interface
 with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
@@ -146,14 +152,15 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
     Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
-    Note: For best results, use clear audio with minimal background noise.
     """)
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(
                 label="Upload Reference Audio",
-                type="filepath"
             )
             reference_text = gr.Textbox(
                 label="Reference Text (leave blank for auto-transcription)",
@@ -194,10 +201,10 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
     gr.Markdown("""
     ### Optimization Notes:
-    - Using INT8 quantization for efficient CPU usage
-    - Optimized audio preprocessing
-    - Cached model loading
     - Memory-efficient inference
     ### Tips for best results:
     1. Use clear, high-quality reference audio

 import gradio as gr
 import torch
 import os
 from outetts.v0_1.interface import InterfaceHF
 import soundfile as sf
 class OptimizedTTSInterface:
     def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
         self.interface = InterfaceHF(model_name)
+        # Apply FP16 optimization where possible
+        self.interface.model = self.interface.model.half().float()
+        # Cache commonly used attributes
+        self.tokenizer = self.interface.model.tokenizer
     def create_speaker(self, *args, **kwargs):
         with torch.inference_mode():
 def initialize_models():
     """Initialize the OptimizedTTS and Faster-Whisper models"""
+    # Create cache directory for models
     cache_dir = Path("model_cache")
     cache_dir.mkdir(exist_ok=True)
+    # Set environment variables for better performance
+    os.environ['OMP_NUM_THREADS'] = '4'
+    os.environ['MKL_NUM_THREADS'] = '4'
+    print("Loading ASR model...")
     asr_model = WhisperModel("tiny",
                             device="cpu",
                             compute_type="int8",
                             num_workers=1,
                             cpu_threads=2,
                             download_root=str(cache_dir))
+    print("Loading TTS model...")
+    tts_interface = OptimizedTTSInterface()
     return tts_interface, asr_model
 def transcribe_audio(audio_path):
         if len(data.shape) > 1:
             data = data.mean(axis=1)
+        # Normalize audio
+        data = data / max(abs(data.max()), abs(data.min()))
         # Save preprocessed audio
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         sf.write(temp_file.name, data, sr)
                 return None, reference_text
         # Create speaker from reference audio
+        with torch.inference_mode():
+            speaker = TTS_INTERFACE.create_speaker(
+                processed_audio,
+                reference_text
+            )
+            # Generate speech with cloned voice
+            output = TTS_INTERFACE.generate(
+                text=text_to_speak,
+                speaker=speaker,
+                temperature=temperature,
+                repetition_penalty=repetition_penalty,
+                max_lenght=4096
+            )
         # Clean up preprocessed audio if it was created
         if processed_audio != audio_path:
                 pass
         return None, f"Error: {str(e)}"
+print("Starting initialization...")
 # Initialize models globally
 TTS_INTERFACE, ASR_MODEL = initialize_models()
+print("Models initialized successfully!")
 # Create Gradio interface
 with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
     Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
+    Note: First run may take longer while models are being cached.
     """)
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(
                 label="Upload Reference Audio",
+                type="filepath",
+                source="microphone"
             )
             reference_text = gr.Textbox(
                 label="Reference Text (leave blank for auto-transcription)",
     gr.Markdown("""
     ### Optimization Notes:
+    - Optimized for CPU performance
+    - Model caching enabled
     - Memory-efficient inference
+    - Automatic audio preprocessing
     ### Tips for best results:
     1. Use clear, high-quality reference audio