Chillarmo committed on
Commit
776e91e
1 Parent(s): 287cc1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -30
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import gradio as gr
2
  import torch
3
- import torch.nn as nn
4
  import os
5
  from outetts.v0_1.interface import InterfaceHF
6
  import soundfile as sf
@@ -15,13 +14,10 @@ torch.set_grad_enabled(False) # Disable gradient computation
15
  class OptimizedTTSInterface:
16
  def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
17
  self.interface = InterfaceHF(model_name)
18
- # Quantize the model to INT8
19
- self.interface.model = torch.quantization.quantize_dynamic(
20
- self.interface.model, {nn.Linear}, dtype=torch.qint8
21
- )
22
- # Move model to CPU and enable inference mode
23
- self.interface.model.cpu()
24
- self.interface.model.eval()
25
 
26
  def create_speaker(self, *args, **kwargs):
27
  with torch.inference_mode():
@@ -33,19 +29,25 @@ class OptimizedTTSInterface:
33
 
34
  def initialize_models():
35
  """Initialize the OptimizedTTS and Faster-Whisper models"""
36
- # Use cached models if available
37
  cache_dir = Path("model_cache")
38
  cache_dir.mkdir(exist_ok=True)
39
 
40
- tts_interface = OptimizedTTSInterface()
 
 
41
 
42
- # Initialize Whisper with maximum optimization
43
  asr_model = WhisperModel("tiny",
44
  device="cpu",
45
  compute_type="int8",
46
  num_workers=1,
47
  cpu_threads=2,
48
  download_root=str(cache_dir))
 
 
 
 
49
  return tts_interface, asr_model
50
 
51
  def transcribe_audio(audio_path):
@@ -79,6 +81,9 @@ def preprocess_audio(audio_path):
79
  if len(data.shape) > 1:
80
  data = data.mean(axis=1)
81
 
 
 
 
82
  # Save preprocessed audio
83
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
84
  sf.write(temp_file.name, data, sr)
@@ -99,19 +104,20 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
99
  return None, reference_text
100
 
101
  # Create speaker from reference audio
102
- speaker = TTS_INTERFACE.create_speaker(
103
- processed_audio,
104
- reference_text
105
- )
 
106
 
107
- # Generate speech with cloned voice
108
- output = TTS_INTERFACE.generate(
109
- text=text_to_speak,
110
- speaker=speaker,
111
- temperature=temperature,
112
- repetition_penalty=repetition_penalty,
113
- max_lenght=4096
114
- )
115
 
116
  # Clean up preprocessed audio if it was created
117
  if processed_audio != audio_path:
@@ -133,10 +139,10 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
133
  pass
134
  return None, f"Error: {str(e)}"
135
 
136
- print("Initializing models...")
137
  # Initialize models globally
138
  TTS_INTERFACE, ASR_MODEL = initialize_models()
139
- print("Models initialized!")
140
 
141
  # Create Gradio interface
142
  with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
@@ -146,14 +152,15 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
146
  Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
147
  and enter the new text you want to be spoken in the cloned voice.
148
 
149
- Note: For best results, use clear audio with minimal background noise.
150
  """)
151
 
152
  with gr.Row():
153
  with gr.Column():
154
  audio_input = gr.Audio(
155
  label="Upload Reference Audio",
156
- type="filepath"
 
157
  )
158
  reference_text = gr.Textbox(
159
  label="Reference Text (leave blank for auto-transcription)",
@@ -194,10 +201,10 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
194
 
195
  gr.Markdown("""
196
  ### Optimization Notes:
197
- - Using INT8 quantization for efficient CPU usage
198
- - Optimized audio preprocessing
199
- - Cached model loading
200
  - Memory-efficient inference
 
201
 
202
  ### Tips for best results:
203
  1. Use clear, high-quality reference audio
 
1
  import gradio as gr
2
  import torch
 
3
  import os
4
  from outetts.v0_1.interface import InterfaceHF
5
  import soundfile as sf
 
14
  class OptimizedTTSInterface:
15
  def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
16
  self.interface = InterfaceHF(model_name)
17
+ # Apply FP16 optimization where possible
18
+ self.interface.model = self.interface.model.half().float()
19
+ # Cache commonly used attributes
20
+ self.tokenizer = self.interface.model.tokenizer
 
 
 
21
 
22
  def create_speaker(self, *args, **kwargs):
23
  with torch.inference_mode():
 
29
 
30
  def initialize_models():
31
  """Initialize the OptimizedTTS and Faster-Whisper models"""
32
+ # Create cache directory for models
33
  cache_dir = Path("model_cache")
34
  cache_dir.mkdir(exist_ok=True)
35
 
36
+ # Set environment variables for better performance
37
+ os.environ['OMP_NUM_THREADS'] = '4'
38
+ os.environ['MKL_NUM_THREADS'] = '4'
39
 
40
+ print("Loading ASR model...")
41
  asr_model = WhisperModel("tiny",
42
  device="cpu",
43
  compute_type="int8",
44
  num_workers=1,
45
  cpu_threads=2,
46
  download_root=str(cache_dir))
47
+
48
+ print("Loading TTS model...")
49
+ tts_interface = OptimizedTTSInterface()
50
+
51
  return tts_interface, asr_model
52
 
53
  def transcribe_audio(audio_path):
 
81
  if len(data.shape) > 1:
82
  data = data.mean(axis=1)
83
 
84
+ # Normalize audio
85
+ data = data / max(abs(data.max()), abs(data.min()))
86
+
87
  # Save preprocessed audio
88
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
89
  sf.write(temp_file.name, data, sr)
 
104
  return None, reference_text
105
 
106
  # Create speaker from reference audio
107
+ with torch.inference_mode():
108
+ speaker = TTS_INTERFACE.create_speaker(
109
+ processed_audio,
110
+ reference_text
111
+ )
112
 
113
+ # Generate speech with cloned voice
114
+ output = TTS_INTERFACE.generate(
115
+ text=text_to_speak,
116
+ speaker=speaker,
117
+ temperature=temperature,
118
+ repetition_penalty=repetition_penalty,
119
+ max_lenght=4096
120
+ )
121
 
122
  # Clean up preprocessed audio if it was created
123
  if processed_audio != audio_path:
 
139
  pass
140
  return None, f"Error: {str(e)}"
141
 
142
# Build the global model handles once, at import time, so every Gradio
# request reuses the same loaded models instead of reloading per call.
print("Starting initialization...")
TTS_INTERFACE, ASR_MODEL = initialize_models()
print("Models initialized successfully!")
146
 
147
  # Create Gradio interface
148
  with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
 
152
  Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
153
  and enter the new text you want to be spoken in the cloned voice.
154
 
155
+ Note: First run may take longer while models are being cached.
156
  """)
157
 
158
  with gr.Row():
159
  with gr.Column():
160
  audio_input = gr.Audio(
161
  label="Upload Reference Audio",
162
+ type="filepath",
163
+ source="microphone"
164
  )
165
  reference_text = gr.Textbox(
166
  label="Reference Text (leave blank for auto-transcription)",
 
201
 
202
  gr.Markdown("""
203
  ### Optimization Notes:
204
+ - Optimized for CPU performance
205
+ - Model caching enabled
 
206
  - Memory-efficient inference
207
+ - Automatic audio preprocessing
208
 
209
  ### Tips for best results:
210
  1. Use clear, high-quality reference audio