Luigi committed on
Commit
da07fc5
·
1 Parent(s): 1b73690

Fix CUDA initialization error: move Whisper loading to GPU function

Browse files
Files changed (1) hide show
  1. app.py +34 -17
app.py CHANGED
@@ -30,7 +30,6 @@ _models_cache = {}
30
  _tokenizer_cache = None
31
  _vocoder_cache = None
32
  _feature_extractor_cache = None
33
- _whisper_model_cache = None
34
 
35
 
36
  def load_models_and_components(model_name: str):
@@ -104,26 +103,15 @@ def load_models_and_components(model_name: str):
104
  model_config["feature"]["sampling_rate"])
105
 
106
 
107
- def load_whisper_model():
108
- """Load and cache Whisper model for transcription."""
109
- global _whisper_model_cache
110
-
111
- if _whisper_model_cache is None:
112
- print("Loading Whisper model for transcription...")
113
- # Use base model for faster transcription
114
- _whisper_model_cache = whisper.load_model("base")
115
-
116
- return _whisper_model_cache
117
-
118
-
119
  def transcribe_audio_whisper(audio_file):
120
  """Transcribe audio file using Whisper."""
121
  if audio_file is None:
122
  return "Error: Please upload an audio file first."
123
 
124
  try:
125
- # Load Whisper model
126
- model = load_whisper_model()
127
 
128
  # Transcribe the audio
129
  result = model.transcribe(audio_file, language="en")
@@ -244,8 +232,37 @@ def create_gradio_interface():
244
  gr.HTML("""
245
  <div class="title">🎵 ZipVoice</div>
246
  <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
247
- <div style="text-align: center; color: #64748b; font-size: 0.9em; margin-bottom: 1em;">
248
- Upload audio, click "Transcribe Audio" to get automatic transcription, then generate speech in that voice!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  </div>
250
  """)
251
 
 
30
  _tokenizer_cache = None
31
  _vocoder_cache = None
32
  _feature_extractor_cache = None
 
33
 
34
 
35
  def load_models_and_components(model_name: str):
 
103
  model_config["feature"]["sampling_rate"])
104
 
105
 
106
+ @spaces.GPU
 
 
 
 
 
 
 
 
 
 
 
107
  def transcribe_audio_whisper(audio_file):
108
  """Transcribe audio file using Whisper."""
109
  if audio_file is None:
110
  return "Error: Please upload an audio file first."
111
 
112
  try:
113
+ # Load Whisper model (will be done on GPU)
114
+ model = whisper.load_model("base")
115
 
116
  # Transcribe the audio
117
  result = model.transcribe(audio_file, language="en")
 
232
  gr.HTML("""
233
  <div class="title">🎵 ZipVoice</div>
234
  <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
235
+
236
+ <div style="background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 1.5em; margin: 1em 0; font-size: 0.9em;">
237
+ <h3 style="margin-top: 0; color: #1e293b;">📖 How to Use / 使用說明</h3>
238
+
239
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 2em; margin-top: 1em;">
240
+ <div>
241
+ <h4 style="color: #2563eb; margin-bottom: 0.5em;">English / 英文</h4>
242
+ <ol style="margin: 0; padding-left: 1.2em; line-height: 1.6;">
243
+ <li><b>Upload Audio:</b> Choose a short audio clip (1-3 seconds) of the voice you want to clone</li>
244
+ <li><b>Transcribe:</b> Click "🎤 Transcribe Audio" to get automatic transcription</li>
245
+ <li><b>Enter Text:</b> Type the text you want to convert to speech</li>
246
+ <li><b>Choose Model:</b> Select ZipVoice (better quality) or ZipVoice Distill (faster)</li>
247
+ <li><b>Adjust Speed:</b> Modify speech speed (0.5 = slower, 2.0 = faster)</li>
248
+ <li><b>Generate:</b> Click "🎵 Generate Speech" to create your audio</li>
249
+ </ol>
250
+ <p style="margin-top: 1em; color: #64748b;"><b>Tips:</b> Use clear audio with minimal background noise for best results.</p>
251
+ </div>
252
+
253
+ <div>
254
+ <h4 style="color: #2563eb; margin-bottom: 0.5em;">繁體中文 / Traditional Chinese</h4>
255
+ <ol style="margin: 0; padding-left: 1.2em; line-height: 1.6;">
256
+ <li><b>上傳音頻:</b>選擇一個簡短的音頻片段(1-3秒)作為要克隆的聲音</li>
257
+ <li><b>轉錄音頻:</b>點擊「🎤 Transcribe Audio」按鈕進行自動轉錄</li>
258
+ <li><b>輸入文字:</b>輸入您要轉換成語音的文字</li>
259
+ <li><b>選擇模型:</b>選擇 ZipVoice(品質較好)或 ZipVoice Distill(速度較快)</li>
260
+ <li><b>調整速度:</b>修改語音速度(0.5 = 較慢,2.0 = 較快)</li>
261
+ <li><b>生成語音:</b>點擊「🎵 Generate Speech」生成音頻</li>
262
+ </ol>
263
+ <p style="margin-top: 1em; color: #64748b;"><b>提示:</b>使用清晰且背景噪音少的音頻以獲得最佳效果。</p>
264
+ </div>
265
+ </div>
266
  </div>
267
  """)
268