Spaces:

Luigi
/

ZipVoice-DEMO

Paused

App Files Files Community

Luigi committed on Sep 25

Commit

da07fc5

1 Parent(s): 1b73690

Fix CUDA initialization error: move Whisper loading to GPU function

Browse files

Files changed (1) hide show

app.py +34 -17

app.py CHANGED Viewed

@@ -30,7 +30,6 @@ _models_cache = {}
 _tokenizer_cache = None
 _vocoder_cache = None
 _feature_extractor_cache = None
-_whisper_model_cache = None
 def load_models_and_components(model_name: str):
@@ -104,26 +103,15 @@ def load_models_and_components(model_name: str):
             model_config["feature"]["sampling_rate"])
-def load_whisper_model():
-    """Load and cache Whisper model for transcription."""
-    global _whisper_model_cache
-    if _whisper_model_cache is None:
-        print("Loading Whisper model for transcription...")
-        # Use base model for faster transcription
-        _whisper_model_cache = whisper.load_model("base")
-    return _whisper_model_cache
 def transcribe_audio_whisper(audio_file):
     """Transcribe audio file using Whisper."""
     if audio_file is None:
         return "Error: Please upload an audio file first."
     try:
-        # Load Whisper model
-        model = load_whisper_model()
         # Transcribe the audio
         result = model.transcribe(audio_file, language="en")
@@ -244,8 +232,37 @@ def create_gradio_interface():
         gr.HTML("""
         <div class="title">🎵 ZipVoice</div>
         <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
-        <div style="text-align: center; color: #64748b; font-size: 0.9em; margin-bottom: 1em;">
-            Upload audio, click "Transcribe Audio" to get automatic transcription, then generate speech in that voice!
         </div>
         """)

 _tokenizer_cache = None
 _vocoder_cache = None
 _feature_extractor_cache = None
 def load_models_and_components(model_name: str):
             model_config["feature"]["sampling_rate"])
+@spaces.GPU
 def transcribe_audio_whisper(audio_file):
     """Transcribe audio file using Whisper."""
     if audio_file is None:
         return "Error: Please upload an audio file first."
     try:
+        # Load Whisper model (will be done on GPU)
+        model = whisper.load_model("base")
         # Transcribe the audio
         result = model.transcribe(audio_file, language="en")
         gr.HTML("""
         <div class="title">🎵 ZipVoice</div>
         <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
+        <div style="background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 1.5em; margin: 1em 0; font-size: 0.9em;">
+            <h3 style="margin-top: 0; color: #1e293b;">📖 How to Use / 使用說明</h3>
+            <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 2em; margin-top: 1em;">
+                <div>
+                    <h4 style="color: #2563eb; margin-bottom: 0.5em;">English / 英文</h4>
+                    <ol style="margin: 0; padding-left: 1.2em; line-height: 1.6;">
+                        <li><b>Upload Audio:</b> Choose a short audio clip (1-3 seconds) of the voice you want to clone</li>
+                        <li><b>Transcribe:</b> Click "🎤 Transcribe Audio" to get automatic transcription</li>
+                        <li><b>Enter Text:</b> Type the text you want to convert to speech</li>
+                        <li><b>Choose Model:</b> Select ZipVoice (better quality) or ZipVoice Distill (faster)</li>
+                        <li><b>Adjust Speed:</b> Modify speech speed (0.5 = slower, 2.0 = faster)</li>
+                        <li><b>Generate:</b> Click "🎵 Generate Speech" to create your audio</li>
+                    </ol>
+                    <p style="margin-top: 1em; color: #64748b;"><b>Tips:</b> Use clear audio with minimal background noise for best results.</p>
+                </div>
+                <div>
+                    <h4 style="color: #2563eb; margin-bottom: 0.5em;">繁體中文 / Traditional Chinese</h4>
+                    <ol style="margin: 0; padding-left: 1.2em; line-height: 1.6;">
+                        <li><b>上傳音頻：</b>選擇一個簡短的音頻片段（1-3秒）作為要克隆的聲音</li>
+                        <li><b>轉錄音頻：</b>點擊「🎤 Transcribe Audio」按鈕進行自動轉錄</li>
+                        <li><b>輸入文字：</b>輸入您要轉換成語音的文字</li>
+                        <li><b>選擇模型：</b>選擇 ZipVoice（品質較好）或 ZipVoice Distill（速度較快）</li>
+                        <li><b>調整速度：</b>修改語音速度（0.5 = 較慢，2.0 = 較快）</li>
+                        <li><b>生成語音：</b>點擊「🎵 Generate Speech」生成音頻</li>
+                    </ol>
+                    <p style="margin-top: 1em; color: #64748b;"><b>提示：</b>使用清晰且背景噪音少的音頻以獲得最佳效果。</p>
+                </div>
+            </div>
         </div>
         """)