Files changed (1) hide show
  1. app.py +105 -130
app.py CHANGED
@@ -1,7 +1,7 @@
1
  """
2
- MediaTek BreezyVoice 混合版本
3
- HF 下載模型 + GitHub clone 推論程式碼
4
- 尊重作者設計,同時在 HF 上顯示模型引用
5
  """
6
 
7
  import gradio as gr
@@ -15,53 +15,53 @@ import subprocess
15
  import sys
16
  from pathlib import Path
17
 
18
- # 設置單線程模式避免多進程衝突
19
  torch.set_num_threads(1)
20
  os.environ['OMP_NUM_THREADS'] = '1'
21
  os.environ['MKL_NUM_THREADS'] = '1'
22
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
23
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
24
 
25
- # 預設參考語音範例 (20秒朗讀)
26
- DEFAULT_REFERENCE_TEXT = "台灣是個美麗的島嶼,擁有豐富的自然景觀和多元的文化特色。從北部的陽明山到南部的墾丁,每個地方都有獨特的魅力。四季分明的氣候讓這裡的生活充滿變化,春天櫻花盛開,夏天海灘戲水,秋天楓葉飄香,冬天溫泉暖身。"
27
 
28
- # 全域變數
29
  cosyvoice = None
30
  setup_completed = False
31
 
32
  @spaces.GPU(duration=300)
33
  def setup_breezyvoice_hybrid():
34
- """設置 BreezyVoice 混合版本 - HF 模型 + GitHub 程式碼"""
35
  global cosyvoice, setup_completed
36
 
37
  if setup_completed:
38
- return "✅ BreezyVoice 混合版已準備就緒"
39
 
40
  try:
41
- print("🔧 正在設置 BreezyVoice 混合版本...")
42
- print("📊 策略: HF 下載模型 + GitHub 下載推論程式碼")
43
 
44
- # 1. HuggingFace 下載模型 (尊重作者,顯示模型引用)
45
  try:
46
  from huggingface_hub import snapshot_download
47
- print("📥 HuggingFace 下載 MediaTek-Research/BreezyVoice 模型...")
48
 
49
  model_path = snapshot_download(
50
  "MediaTek-Research/BreezyVoice",
51
  cache_dir="/tmp/hf_models",
52
  local_files_only=False
53
  )
54
- print(f"✅ HF 模型下載完成: {model_path}")
55
 
56
  except Exception as e:
57
- print(f"⚠️ HF 模型下載失敗: {e}")
58
- print("🔄 改用 GitHub 直接下載模型...")
59
- model_path = "MediaTek-Research/BreezyVoice" # 備用方案
60
 
61
- # 2. GitHub 下載推論程式碼 (作者設計的使用方式)
62
  repo_path = "/tmp/BreezyVoice"
63
  if not os.path.exists(repo_path):
64
- print("📥 GitHub 下載 BreezyVoice 推論程式碼...")
65
  result = subprocess.run([
66
  "git", "clone",
67
  "https://github.com/mtkresearch/BreezyVoice.git",
@@ -69,48 +69,48 @@ def setup_breezyvoice_hybrid():
69
  ], capture_output=True, text=True, timeout=300)
70
 
71
  if result.returncode != 0:
72
- raise Exception(f"程式碼下載失敗: {result.stderr}")
73
 
74
- # 3. 添加模組路徑
75
  sys.path.insert(0, repo_path)
76
 
77
- # 4. 導入作者設計的核心模組
78
  try:
79
  from single_inference import CustomCosyVoice
80
- print("✅ BreezyVoice 核心模組導入成功")
81
  except ImportError as e:
82
- raise Exception(f"模組導入失敗: {e}")
83
 
84
- # 5. 初始化模型 (混合方式:HF 模型路徑 + GitHub 程式碼)
85
- print(f"🔄 初始化混合版本模型...")
86
- print(f"📍 模型路徑: {model_path}")
87
 
88
  cosyvoice = CustomCosyVoice(model_path)
89
 
90
  setup_completed = True
91
- print("✅ BreezyVoice 混合版設置完成!")
92
 
93
- # 檢查 VRAM 使用
94
  if torch.cuda.is_available():
95
  vram_used = torch.cuda.memory_allocated() / 1024**3
96
- return f"✅ BreezyVoice 混合版設置完成!\n📊 模型來源: HuggingFace MediaTek-Research/BreezyVoice\n🔧 推論程式: GitHub mtkresearch/BreezyVoice\n💾 VRAM 使用: {vram_used:.2f}GB"
97
 
98
- return "✅ BreezyVoice 混合版設置完成!"
99
 
100
  except Exception as e:
101
- print(f"❌ 設置失敗: {str(e)}")
102
- return f"❌ 設置失敗: {str(e)}"
103
 
104
  @spaces.GPU(duration=180)
105
  def breezy_voice_clone_hybrid(speaker_audio, content_text, speaker_transcription=None):
106
- """執行 BreezyVoice 語音克隆 - 混合版本"""
107
  global cosyvoice
108
 
109
  if speaker_audio is None:
110
- return None, "❌ 請先上傳或錄製參考語音"
111
 
112
  if not content_text.strip():
113
- return None, "❌ 請輸入要合成的文字"
114
 
115
  if not setup_completed or cosyvoice is None:
116
  setup_status = setup_breezyvoice_hybrid()
@@ -119,118 +119,109 @@ def breezy_voice_clone_hybrid(speaker_audio, content_text, speaker_transcription
119
 
120
  try:
121
  with tempfile.TemporaryDirectory() as temp_dir:
122
- # 處理輸入音訊
123
  input_audio_path = os.path.join(temp_dir, "speaker_voice.wav")
124
  output_audio_path = os.path.join(temp_dir, "cloned_voice.wav")
125
 
126
- # 保存參考音訊
127
  sample_rate, audio_data = speaker_audio
128
  torchaudio.save(input_audio_path, torch.tensor(audio_data).unsqueeze(0), sample_rate)
129
 
130
- # 使用參考轉錄或預設值
131
  if not speaker_transcription or not speaker_transcription.strip():
132
  speaker_transcription = DEFAULT_REFERENCE_TEXT
133
 
134
- print(f"🎤 合成文字: {content_text}")
135
- print(f"📝 參考轉錄: {speaker_transcription}")
136
 
137
- # 執行語音合成 - 混合版本
138
  synthesis_start = time.time()
139
 
140
  try:
141
- # 載入音訊為 16kHz
142
  from cosyvoice.utils.file_utils import load_wav
143
  prompt_speech_16k = load_wav(input_audio_path, 16000)
144
 
145
- print("🔄 執行混合版語音合成推論...")
146
 
147
- # 使用作者設計的 no_normalize 版本 (與本地測試一致)
148
  output = cosyvoice.inference_zero_shot_no_normalize(
149
  content_text,
150
  speaker_transcription,
151
  prompt_speech_16k
152
  )
153
 
154
- # 保存輸出音訊
155
  if output is not None and 'tts_speech' in output:
156
  tts_speech = output['tts_speech']
157
  torchaudio.save(output_audio_path, tts_speech, 22050)
158
 
159
  synthesis_time = time.time() - synthesis_start
160
 
161
- # 檢查輸出
162
  if os.path.exists(output_audio_path):
163
- # 讀取合成的音訊
164
  synthesized_audio, file_sample_rate = torchaudio.load(output_audio_path)
165
  synthesized_audio = synthesized_audio.numpy()
166
 
167
- # 計算音訊長度
168
  audio_duration = synthesized_audio.shape[1] / file_sample_rate
169
  rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
170
 
171
- # 檢查 VRAM 使用
172
  vram_info = ""
173
  if torch.cuda.is_available():
174
  vram_used = torch.cuda.memory_allocated() / 1024**3
175
  vram_info = f"💾 VRAM: {vram_used:.2f}GB"
176
 
177
- status = f"""✅ 混合版語音克隆成功!
178
 
179
- 🎙️ 參考語音: {len(audio_data)/sample_rate:.1f}
180
- 📝 合成內容: {content_text}
181
- 📝 使用轉錄: {speaker_transcription[:30]}...
182
- ⏱️ 合成時間: {synthesis_time:.1f}
183
- 🎵 輸出長度: {audio_duration:.1f}
184
- 📊 RTF: {rtf:.3f} {'(實時)' if rtf < 1.0 else '(非實時)'}
185
  {vram_info}
186
- 🤖 模型: MediaTek BreezyVoice 混合版
187
- 📊 模型來源: HuggingFace MediaTek-Research/BreezyVoice
188
- 🔧 推論程式: GitHub mtkresearch/BreezyVoice"""
189
 
190
  return (file_sample_rate, synthesized_audio[0]), status
191
  else:
192
- return None, "❌ 語音合成失敗:未生成輸出檔案"
193
 
194
  except Exception as e:
195
  import traceback
196
  traceback.print_exc()
197
- return None, f"❌ 語音合成失敗: {str(e)}"
198
 
199
  except Exception as e:
200
  import traceback
201
  traceback.print_exc()
202
- return None, f"❌ 處理錯誤: {str(e)}"
203
 
204
  def load_example_text():
205
- """載入預設範例文字"""
206
  return DEFAULT_REFERENCE_TEXT
207
 
208
- # 創建 Gradio 界面
209
- with gr.Blocks(title="BreezyVoice 混合版", theme=gr.themes.Soft()) as demo:
210
- gr.Markdown("# 🎭 MediaTek BreezyVoice 混合版")
211
- gr.Markdown("**零樣本語音克隆系統** - 專為台灣繁體中文優化 (混合版)")
212
- gr.Markdown("📊 **技術架構**: HuggingFace 模型 + GitHub 推論程式碼")
213
 
214
- # 初始化狀態顯示
215
  setup_status = gr.Textbox(
216
- label="🔧 系統狀態",
217
- value="⏳ 準備初始化 BreezyVoice 混合版...",
218
  interactive=False
219
  )
220
 
221
- # 初始化按鈕
222
- init_btn = gr.Button("🚀 初始化 BreezyVoice 混合版", variant="primary")
223
 
224
  with gr.Row():
225
  with gr.Column(scale=1):
226
- gr.Markdown("### 🎙️ 步驟 1: 上傳參考語音")
227
- gr.Markdown("請照著下面的範例文字朗讀,上傳 5-20 秒清晰語音")
228
 
229
- # 顯示範例文字
230
- gr.Markdown("#### 📖 建議朗讀範例:")
231
  example_display = gr.Textbox(
232
  value=DEFAULT_REFERENCE_TEXT,
233
- label="請照著這段文字朗讀 (20)",
234
  lines=4,
235
  interactive=False
236
  )
@@ -238,90 +229,74 @@ with gr.Blocks(title="BreezyVoice 混合版", theme=gr.themes.Soft()) as demo:
238
  speaker_audio = gr.Audio(
239
  sources=["microphone", "upload"],
240
  type="numpy",
241
- label="參考語音錄音 (照著上面文字念)"
242
  )
243
 
244
- gr.Markdown("### 📝 步驟 2: 輸入合成文字")
245
  content_text = gr.Textbox(
246
  lines=3,
247
- placeholder="請輸入要用克隆聲音說出的內容...",
248
- label="合成文字內容",
249
- value="歡迎來到我們的語音合成系統!這個技術可以模仿任何人的聲音,讓文字轉換成自然流暢的語音。"
250
  )
251
 
252
- gr.Markdown("### 🔤 步驟 3: 參考語音轉錄")
253
  speaker_transcription = gr.Textbox(
254
  lines=3,
255
- label="參考語音轉錄 (預設範例)",
256
  value=DEFAULT_REFERENCE_TEXT
257
  )
258
 
259
- # 載入範例按鈕
260
- load_example_btn = gr.Button("📄 載入預設範例", variant="secondary")
261
-
262
- clone_btn = gr.Button("🎭 開始混合版語音克隆", variant="primary", size="lg")
263
 
264
  with gr.Column(scale=1):
265
- gr.Markdown("### 🎵 克隆結果")
266
 
267
  result_audio = gr.Audio(
268
- label="克隆的語音",
269
  type="numpy"
270
  )
271
 
272
  result_status = gr.Textbox(
273
- label="📋 處理狀態",
274
  lines=15,
275
  max_lines=20,
276
  interactive=False
277
  )
278
 
279
- # 使用說明
280
- with gr.Accordion("📖 使用說明", open=False):
281
  gr.Markdown(f"""
282
- ## 🎯 最佳使用方式
283
- 1. **📖 朗讀範例**: 請照著範例文字清晰朗讀
284
- 2. **🎙️ 錄音要求**: 5-20 秒,環境安靜,發音清楚
285
- 3. **✨ 克隆效果**: 系統會用您的聲音說出任何文字
286
 
287
- ## 📝 範例文字內容
288
  ```
289
  {DEFAULT_REFERENCE_TEXT}
290
  ```
291
 
292
- ## ⚡ 技術特色
293
- - 🇹🇼 台灣繁體中文專門優化
294
- - 🎯 零樣本克隆(無需訓練)
295
- - ⚡ ZeroGPU 加速處理
296
- - 🔊 MediaTek 先進語音合成技術
297
- - 🤗 HuggingFace 模型引用 + GitHub 推論程式碼
298
 
299
- ## 💡 混合版本優勢
300
- - **模型來源**: 直接引用 HuggingFace MediaTek-Research/BreezyVoice
301
- - **推論程式**: 使用作者原始 GitHub mtkresearch/BreezyVoice
302
- - **最佳實踐**: 尊重作者設計,同時展示模型使用
303
- - **技術穩定**: 結合官方模型和原始推論程式碼
304
 
305
- ## 🙏 致謝
306
- 感謝 MediaTek Research 團隊開發 BreezyVoice 模型
307
  """)
308
 
309
- # 事件綁定
310
- init_btn.click(
311
- fn=setup_breezyvoice_hybrid,
312
- outputs=[setup_status]
313
- )
314
-
315
- load_example_btn.click(
316
- fn=load_example_text,
317
- outputs=[speaker_transcription]
318
- )
319
-
320
- clone_btn.click(
321
- fn=breezy_voice_clone_hybrid,
322
- inputs=[speaker_audio, content_text, speaker_transcription],
323
- outputs=[result_audio, result_status]
324
- )
325
 
326
  if __name__ == "__main__":
327
- demo.launch()
 
1
  """
2
+ MediaTek BreezyVoice Hybrid Version
3
+ HF downloads the model + GitHub clone for inference code
4
+ Respect the author’s design and display model citation on HF
5
  """
6
 
7
  import gradio as gr
 
15
  import sys
16
  from pathlib import Path
17
 
18
+ # Set single-thread mode to avoid multiprocessing conflicts
19
  torch.set_num_threads(1)
20
  os.environ['OMP_NUM_THREADS'] = '1'
21
  os.environ['MKL_NUM_THREADS'] = '1'
22
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
23
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
24
 
25
+ # Default reference voice sample (about 20 seconds of reading)
26
+ DEFAULT_REFERENCE_TEXT = "Taiwan is a beautiful island with rich natural scenery and diverse cultural characteristics. From Yangmingshan in the north to Kenting in the south, each place has its own unique charm. The four distinct seasons make life here full of variety—cherry blossoms in spring, beach fun in summer, maple leaves in autumn, and hot springs in winter."
27
 
28
+ # Global variables
29
  cosyvoice = None
30
  setup_completed = False
31
 
32
  @spaces.GPU(duration=300)
33
  def setup_breezyvoice_hybrid():
34
+ """Setup BreezyVoice Hybrid Version - HF model + GitHub code"""
35
  global cosyvoice, setup_completed
36
 
37
  if setup_completed:
38
+ return "✅ BreezyVoice Hybrid version is ready"
39
 
40
  try:
41
+ print("🔧 Setting up BreezyVoice Hybrid Version...")
42
+ print("📊 Strategy: Download model from HF + Download inference code from GitHub")
43
 
44
+ # 1. Download model from HuggingFace (respecting the author with citation)
45
  try:
46
  from huggingface_hub import snapshot_download
47
+ print("📥 Downloading MediaTek-Research/BreezyVoice model from HuggingFace...")
48
 
49
  model_path = snapshot_download(
50
  "MediaTek-Research/BreezyVoice",
51
  cache_dir="/tmp/hf_models",
52
  local_files_only=False
53
  )
54
+ print(f"✅ HF model downloaded: {model_path}")
55
 
56
  except Exception as e:
57
+ print(f"⚠️ HF model download failed: {e}")
58
+ print("🔄 Falling back to direct GitHub model path...")
59
+ model_path = "MediaTek-Research/BreezyVoice" # fallback
60
 
61
+ # 2. Download inference code from GitHub (as designed by the author)
62
  repo_path = "/tmp/BreezyVoice"
63
  if not os.path.exists(repo_path):
64
+ print("📥 Downloading BreezyVoice inference code from GitHub...")
65
  result = subprocess.run([
66
  "git", "clone",
67
  "https://github.com/mtkresearch/BreezyVoice.git",
 
69
  ], capture_output=True, text=True, timeout=300)
70
 
71
  if result.returncode != 0:
72
+ raise Exception(f"Code download failed: {result.stderr}")
73
 
74
+ # 3. Add module path
75
  sys.path.insert(0, repo_path)
76
 
77
+ # 4. Import core module from author
78
  try:
79
  from single_inference import CustomCosyVoice
80
+ print("✅ BreezyVoice core module imported successfully")
81
  except ImportError as e:
82
+ raise Exception(f"Module import failed: {e}")
83
 
84
+ # 5. Initialize model (hybrid: HF model path + GitHub code)
85
+ print(f"🔄 Initializing hybrid model...")
86
+ print(f"📍 Model path: {model_path}")
87
 
88
  cosyvoice = CustomCosyVoice(model_path)
89
 
90
  setup_completed = True
91
+ print("✅ BreezyVoice hybrid setup complete!")
92
 
93
+ # Check VRAM usage
94
  if torch.cuda.is_available():
95
  vram_used = torch.cuda.memory_allocated() / 1024**3
96
+ return f"✅ BreezyVoice Hybrid setup complete!\n📊 Model source: HuggingFace MediaTek-Research/BreezyVoice\n🔧 Inference code: GitHub mtkresearch/BreezyVoice\n💾 VRAM usage: {vram_used:.2f}GB"
97
 
98
+ return "✅ BreezyVoice Hybrid setup complete!"
99
 
100
  except Exception as e:
101
+ print(f"❌ Setup failed: {str(e)}")
102
+ return f"❌ Setup failed: {str(e)}"
103
 
104
  @spaces.GPU(duration=180)
105
  def breezy_voice_clone_hybrid(speaker_audio, content_text, speaker_transcription=None):
106
+ """Run BreezyVoice voice cloning - hybrid version"""
107
  global cosyvoice
108
 
109
  if speaker_audio is None:
110
+ return None, "❌ Please upload or record a reference voice first"
111
 
112
  if not content_text.strip():
113
+ return None, "❌ Please enter the text to synthesize"
114
 
115
  if not setup_completed or cosyvoice is None:
116
  setup_status = setup_breezyvoice_hybrid()
 
119
 
120
  try:
121
  with tempfile.TemporaryDirectory() as temp_dir:
122
+ # Handle input audio
123
  input_audio_path = os.path.join(temp_dir, "speaker_voice.wav")
124
  output_audio_path = os.path.join(temp_dir, "cloned_voice.wav")
125
 
126
+ # Save reference audio
127
  sample_rate, audio_data = speaker_audio
128
  torchaudio.save(input_audio_path, torch.tensor(audio_data).unsqueeze(0), sample_rate)
129
 
130
+ # Use provided or default transcription
131
  if not speaker_transcription or not speaker_transcription.strip():
132
  speaker_transcription = DEFAULT_REFERENCE_TEXT
133
 
134
+ print(f"🎤 Synthesizing text: {content_text}")
135
+ print(f"📝 Reference transcription: {speaker_transcription}")
136
 
137
+ # Run synthesis - hybrid version
138
  synthesis_start = time.time()
139
 
140
  try:
 
141
  from cosyvoice.utils.file_utils import load_wav
142
  prompt_speech_16k = load_wav(input_audio_path, 16000)
143
 
144
+ print("🔄 Running hybrid voice synthesis inference...")
145
 
146
+ # Use no_normalize version (as in local testing)
147
  output = cosyvoice.inference_zero_shot_no_normalize(
148
  content_text,
149
  speaker_transcription,
150
  prompt_speech_16k
151
  )
152
 
 
153
  if output is not None and 'tts_speech' in output:
154
  tts_speech = output['tts_speech']
155
  torchaudio.save(output_audio_path, tts_speech, 22050)
156
 
157
  synthesis_time = time.time() - synthesis_start
158
 
 
159
  if os.path.exists(output_audio_path):
 
160
  synthesized_audio, file_sample_rate = torchaudio.load(output_audio_path)
161
  synthesized_audio = synthesized_audio.numpy()
162
 
 
163
  audio_duration = synthesized_audio.shape[1] / file_sample_rate
164
  rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
165
 
 
166
  vram_info = ""
167
  if torch.cuda.is_available():
168
  vram_used = torch.cuda.memory_allocated() / 1024**3
169
  vram_info = f"💾 VRAM: {vram_used:.2f}GB"
170
 
171
+ status = f"""✅ Hybrid voice cloning successful!
172
 
173
+ 🎙️ Reference audio: {len(audio_data)/sample_rate:.1f}s
174
+ 📝 Synthesized content: {content_text}
175
+ 📝 Used transcription: {speaker_transcription[:30]}...
176
+ ⏱️ Synthesis time: {synthesis_time:.1f}s
177
+ 🎵 Output length: {audio_duration:.1f}s
178
+ 📊 RTF: {rtf:.3f} {'(real-time)' if rtf < 1.0 else '(non real-time)'}
179
  {vram_info}
180
+ 🤖 Model: MediaTek BreezyVoice Hybrid
181
+ 📊 Model source: HuggingFace MediaTek-Research/BreezyVoice
182
+ 🔧 Inference code: GitHub mtkresearch/BreezyVoice"""
183
 
184
  return (file_sample_rate, synthesized_audio[0]), status
185
  else:
186
+ return None, "❌ Synthesis failed: No output file generated"
187
 
188
  except Exception as e:
189
  import traceback
190
  traceback.print_exc()
191
+ return None, f"❌ Synthesis failed: {str(e)}"
192
 
193
  except Exception as e:
194
  import traceback
195
  traceback.print_exc()
196
+ return None, f"❌ Processing error: {str(e)}"
197
 
198
  def load_example_text():
199
+ """Load default example text"""
200
  return DEFAULT_REFERENCE_TEXT
201
 
202
+ # Build Gradio interface
203
+ with gr.Blocks(title="BreezyVoice Hybrid", theme=gr.themes.Soft()) as demo:
204
+ gr.Markdown("# 🎭 MediaTek BreezyVoice Hybrid")
205
+ gr.Markdown("**Zero-shot voice cloning system** optimized for Traditional Chinese (Taiwan)")
206
+ gr.Markdown("📊 **Architecture**: HuggingFace model + GitHub inference code")
207
 
 
208
  setup_status = gr.Textbox(
209
+ label="🔧 System Status",
210
+ value="⏳ Preparing BreezyVoice Hybrid initialization...",
211
  interactive=False
212
  )
213
 
214
+ init_btn = gr.Button("🚀 Initialize BreezyVoice Hybrid", variant="primary")
 
215
 
216
  with gr.Row():
217
  with gr.Column(scale=1):
218
+ gr.Markdown("### 🎙️ Step 1: Upload Reference Voice")
219
+ gr.Markdown("Please read the following sample text aloud and upload a clear 5-20 second voice recording")
220
 
221
+ gr.Markdown("#### 📖 Recommended Reading Sample:")
 
222
  example_display = gr.Textbox(
223
  value=DEFAULT_REFERENCE_TEXT,
224
+ label="Read this text aloud (around 20 seconds)",
225
  lines=4,
226
  interactive=False
227
  )
 
229
  speaker_audio = gr.Audio(
230
  sources=["microphone", "upload"],
231
  type="numpy",
232
+ label="Reference voice recording (read the text above)"
233
  )
234
 
235
+ gr.Markdown("### 📝 Step 2: Enter Text to Synthesize")
236
  content_text = gr.Textbox(
237
  lines=3,
238
+ placeholder="Enter the text you want spoken in the cloned voice...",
239
+ label="Synthesis text",
240
+ value="Welcome to our voice synthesis system! This technology can mimic anyone’s voice and convert text into natural, fluent speech."
241
  )
242
 
243
+ gr.Markdown("### 🔤 Step 3: Reference Voice Transcription")
244
  speaker_transcription = gr.Textbox(
245
  lines=3,
246
+ label="Reference transcription (default example)",
247
  value=DEFAULT_REFERENCE_TEXT
248
  )
249
 
250
+ load_example_btn = gr.Button("📄 Load Default Example", variant="secondary")
251
+ clone_btn = gr.Button("🎭 Start Hybrid Voice Cloning", variant="primary", size="lg")
 
 
252
 
253
  with gr.Column(scale=1):
254
+ gr.Markdown("### 🎵 Cloning Result")
255
 
256
  result_audio = gr.Audio(
257
+ label="Cloned Voice",
258
  type="numpy"
259
  )
260
 
261
  result_status = gr.Textbox(
262
+ label="📋 Processing Status",
263
  lines=15,
264
  max_lines=20,
265
  interactive=False
266
  )
267
 
268
+ with gr.Accordion("📖 User Guide", open=False):
 
269
  gr.Markdown(f"""
270
+ ## 🎯 Best Practices
271
+ 1. **📖 Read the sample clearly**
272
+ 2. **🎙️ Recording requirements**: 5-20 seconds, quiet environment, clear pronunciation
273
+ 3. **✨ Cloning effect**: The system will use your voice to speak any text
274
 
275
+ ## 📝 Sample Text
276
  ```
277
  {DEFAULT_REFERENCE_TEXT}
278
  ```
279
 
280
+ ## ⚡ Technical Highlights
281
+ - 🇹🇼 Optimized for Traditional Chinese (Taiwan)
282
+ - 🎯 Zero-shot cloning (no training required)
283
+ - ⚡ ZeroGPU accelerated processing
284
+ - 🔊 MediaTek advanced speech synthesis
285
+ - 🤗 HuggingFace model citation + GitHub inference code
286
 
287
+ ## 💡 Advantages of the Hybrid Version
288
+ - **Model Source**: HuggingFace MediaTek-Research/BreezyVoice
289
+ - **Inference Code**: GitHub mtkresearch/BreezyVoice
290
+ - **Best Practice**: Respect author design and display model usage
291
+ - **Stability**: Combines official model with original inference code
292
 
293
+ ## 🙏 Acknowledgment
294
+ Thanks to MediaTek Research for developing the BreezyVoice model.
295
  """)
296
 
297
+ init_btn.click(fn=setup_breezyvoice_hybrid, outputs=[setup_status])
298
+ load_example_btn.click(fn=load_example_text, outputs=[speaker_transcription])
299
+ clone_btn.click(fn=breezy_voice_clone_hybrid, inputs=[speaker_audio, content_text, speaker_transcription], outputs=[result_audio, result_status])
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
  if __name__ == "__main__":
302
+ demo.launch()