Spaces:
Paused
Paused
Fix CUDA initialization error: move Whisper loading to GPU function
Browse files
app.py
CHANGED
|
@@ -30,7 +30,6 @@ _models_cache = {}
|
|
| 30 |
_tokenizer_cache = None
|
| 31 |
_vocoder_cache = None
|
| 32 |
_feature_extractor_cache = None
|
| 33 |
-
_whisper_model_cache = None
|
| 34 |
|
| 35 |
|
| 36 |
def load_models_and_components(model_name: str):
|
|
@@ -104,26 +103,15 @@ def load_models_and_components(model_name: str):
|
|
| 104 |
model_config["feature"]["sampling_rate"])
|
| 105 |
|
| 106 |
|
| 107 |
-
|
| 108 |
-
"""Load and cache Whisper model for transcription."""
|
| 109 |
-
global _whisper_model_cache
|
| 110 |
-
|
| 111 |
-
if _whisper_model_cache is None:
|
| 112 |
-
print("Loading Whisper model for transcription...")
|
| 113 |
-
# Use base model for faster transcription
|
| 114 |
-
_whisper_model_cache = whisper.load_model("base")
|
| 115 |
-
|
| 116 |
-
return _whisper_model_cache
|
| 117 |
-
|
| 118 |
-
|
| 119 |
def transcribe_audio_whisper(audio_file):
|
| 120 |
"""Transcribe audio file using Whisper."""
|
| 121 |
if audio_file is None:
|
| 122 |
return "Error: Please upload an audio file first."
|
| 123 |
|
| 124 |
try:
|
| 125 |
-
# Load Whisper model
|
| 126 |
-
model =
|
| 127 |
|
| 128 |
# Transcribe the audio
|
| 129 |
result = model.transcribe(audio_file, language="en")
|
|
@@ -244,8 +232,37 @@ def create_gradio_interface():
|
|
| 244 |
gr.HTML("""
|
| 245 |
<div class="title">🎵 ZipVoice</div>
|
| 246 |
<div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
</div>
|
| 250 |
""")
|
| 251 |
|
|
|
|
| 30 |
_tokenizer_cache = None
|
| 31 |
_vocoder_cache = None
|
| 32 |
_feature_extractor_cache = None
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def load_models_and_components(model_name: str):
|
|
|
|
| 103 |
model_config["feature"]["sampling_rate"])
|
| 104 |
|
| 105 |
|
| 106 |
+
@spaces.GPU
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
def transcribe_audio_whisper(audio_file):
|
| 108 |
"""Transcribe audio file using Whisper."""
|
| 109 |
if audio_file is None:
|
| 110 |
return "Error: Please upload an audio file first."
|
| 111 |
|
| 112 |
try:
|
| 113 |
+
# Load Whisper model (will be done on GPU)
|
| 114 |
+
model = whisper.load_model("base")
|
| 115 |
|
| 116 |
# Transcribe the audio
|
| 117 |
result = model.transcribe(audio_file, language="en")
|
|
|
|
| 232 |
gr.HTML("""
|
| 233 |
<div class="title">🎵 ZipVoice</div>
|
| 234 |
<div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
|
| 235 |
+
|
| 236 |
+
<div style="background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 1.5em; margin: 1em 0; font-size: 0.9em;">
|
| 237 |
+
<h3 style="margin-top: 0; color: #1e293b;">📖 How to Use / 使用說明</h3>
|
| 238 |
+
|
| 239 |
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 2em; margin-top: 1em;">
|
| 240 |
+
<div>
|
| 241 |
+
<h4 style="color: #2563eb; margin-bottom: 0.5em;">English / 英文</h4>
|
| 242 |
+
<ol style="margin: 0; padding-left: 1.2em; line-height: 1.6;">
|
| 243 |
+
<li><b>Upload Audio:</b> Choose a short audio clip (1-3 seconds) of the voice you want to clone</li>
|
| 244 |
+
<li><b>Transcribe:</b> Click "🎤 Transcribe Audio" to get automatic transcription</li>
|
| 245 |
+
<li><b>Enter Text:</b> Type the text you want to convert to speech</li>
|
| 246 |
+
<li><b>Choose Model:</b> Select ZipVoice (better quality) or ZipVoice Distill (faster)</li>
|
| 247 |
+
<li><b>Adjust Speed:</b> Modify speech speed (0.5 = slower, 2.0 = faster)</li>
|
| 248 |
+
<li><b>Generate:</b> Click "🎵 Generate Speech" to create your audio</li>
|
| 249 |
+
</ol>
|
| 250 |
+
<p style="margin-top: 1em; color: #64748b;"><b>Tips:</b> Use clear audio with minimal background noise for best results.</p>
|
| 251 |
+
</div>
|
| 252 |
+
|
| 253 |
+
<div>
|
| 254 |
+
<h4 style="color: #2563eb; margin-bottom: 0.5em;">繁體中文 / Traditional Chinese</h4>
|
| 255 |
+
<ol style="margin: 0; padding-left: 1.2em; line-height: 1.6;">
|
| 256 |
+
<li><b>上傳音頻:</b>選擇一個簡短的音頻片段(1-3秒)作為要克隆的聲音</li>
|
| 257 |
+
<li><b>轉錄音頻:</b>點擊「🎤 Transcribe Audio」按鈕進行自動轉錄</li>
|
| 258 |
+
<li><b>輸入文字:</b>輸入您要轉換成語音的文字</li>
|
| 259 |
+
<li><b>選擇模型:</b>選擇 ZipVoice(品質較好)或 ZipVoice Distill(速度較快)</li>
|
| 260 |
+
<li><b>調整速度:</b>修改語音速度(0.5 = 較慢,2.0 = 較快)</li>
|
| 261 |
+
<li><b>生成語音:</b>點擊「🎵 Generate Speech」生成音頻</li>
|
| 262 |
+
</ol>
|
| 263 |
+
<p style="margin-top: 1em; color: #64748b;"><b>提示:</b>使用清晰且背景噪音少的音頻以獲得最佳效果。</p>
|
| 264 |
+
</div>
|
| 265 |
+
</div>
|
| 266 |
</div>
|
| 267 |
""")
|
| 268 |
|