Spaces:
Sleeping
Sleeping
Update app.py
#1
by tahirturk - opened
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
-
MediaTek BreezyVoice
|
| 3 |
-
HF
|
| 4 |
-
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
|
@@ -15,53 +15,53 @@ import subprocess
|
|
| 15 |
import sys
|
| 16 |
from pathlib import Path
|
| 17 |
|
| 18 |
-
#
|
| 19 |
torch.set_num_threads(1)
|
| 20 |
os.environ['OMP_NUM_THREADS'] = '1'
|
| 21 |
os.environ['MKL_NUM_THREADS'] = '1'
|
| 22 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 23 |
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
| 24 |
|
| 25 |
-
#
|
| 26 |
-
DEFAULT_REFERENCE_TEXT = "
|
| 27 |
|
| 28 |
-
#
|
| 29 |
cosyvoice = None
|
| 30 |
setup_completed = False
|
| 31 |
|
| 32 |
@spaces.GPU(duration=300)
|
| 33 |
def setup_breezyvoice_hybrid():
|
| 34 |
-
"""
|
| 35 |
global cosyvoice, setup_completed
|
| 36 |
|
| 37 |
if setup_completed:
|
| 38 |
-
return "✅ BreezyVoice
|
| 39 |
|
| 40 |
try:
|
| 41 |
-
print("🔧
|
| 42 |
-
print("📊
|
| 43 |
|
| 44 |
-
# 1.
|
| 45 |
try:
|
| 46 |
from huggingface_hub import snapshot_download
|
| 47 |
-
print("📥
|
| 48 |
|
| 49 |
model_path = snapshot_download(
|
| 50 |
"MediaTek-Research/BreezyVoice",
|
| 51 |
cache_dir="/tmp/hf_models",
|
| 52 |
local_files_only=False
|
| 53 |
)
|
| 54 |
-
print(f"✅ HF
|
| 55 |
|
| 56 |
except Exception as e:
|
| 57 |
-
print(f"⚠️ HF
|
| 58 |
-
print("🔄
|
| 59 |
-
model_path = "MediaTek-Research/BreezyVoice" #
|
| 60 |
|
| 61 |
-
# 2.
|
| 62 |
repo_path = "/tmp/BreezyVoice"
|
| 63 |
if not os.path.exists(repo_path):
|
| 64 |
-
print("📥
|
| 65 |
result = subprocess.run([
|
| 66 |
"git", "clone",
|
| 67 |
"https://github.com/mtkresearch/BreezyVoice.git",
|
|
@@ -69,48 +69,48 @@ def setup_breezyvoice_hybrid():
|
|
| 69 |
], capture_output=True, text=True, timeout=300)
|
| 70 |
|
| 71 |
if result.returncode != 0:
|
| 72 |
-
raise Exception(f"
|
| 73 |
|
| 74 |
-
# 3.
|
| 75 |
sys.path.insert(0, repo_path)
|
| 76 |
|
| 77 |
-
# 4.
|
| 78 |
try:
|
| 79 |
from single_inference import CustomCosyVoice
|
| 80 |
-
print("✅ BreezyVoice
|
| 81 |
except ImportError as e:
|
| 82 |
-
raise Exception(f"
|
| 83 |
|
| 84 |
-
# 5.
|
| 85 |
-
print(f"🔄
|
| 86 |
-
print(f"📍
|
| 87 |
|
| 88 |
cosyvoice = CustomCosyVoice(model_path)
|
| 89 |
|
| 90 |
setup_completed = True
|
| 91 |
-
print("✅ BreezyVoice
|
| 92 |
|
| 93 |
-
#
|
| 94 |
if torch.cuda.is_available():
|
| 95 |
vram_used = torch.cuda.memory_allocated() / 1024**3
|
| 96 |
-
return f"✅ BreezyVoice
|
| 97 |
|
| 98 |
-
return "✅ BreezyVoice
|
| 99 |
|
| 100 |
except Exception as e:
|
| 101 |
-
print(f"❌
|
| 102 |
-
return f"❌
|
| 103 |
|
| 104 |
@spaces.GPU(duration=180)
|
| 105 |
def breezy_voice_clone_hybrid(speaker_audio, content_text, speaker_transcription=None):
|
| 106 |
-
"""
|
| 107 |
global cosyvoice
|
| 108 |
|
| 109 |
if speaker_audio is None:
|
| 110 |
-
return None, "❌
|
| 111 |
|
| 112 |
if not content_text.strip():
|
| 113 |
-
return None, "❌
|
| 114 |
|
| 115 |
if not setup_completed or cosyvoice is None:
|
| 116 |
setup_status = setup_breezyvoice_hybrid()
|
|
@@ -119,118 +119,109 @@ def breezy_voice_clone_hybrid(speaker_audio, content_text, speaker_transcription
|
|
| 119 |
|
| 120 |
try:
|
| 121 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 122 |
-
#
|
| 123 |
input_audio_path = os.path.join(temp_dir, "speaker_voice.wav")
|
| 124 |
output_audio_path = os.path.join(temp_dir, "cloned_voice.wav")
|
| 125 |
|
| 126 |
-
#
|
| 127 |
sample_rate, audio_data = speaker_audio
|
| 128 |
torchaudio.save(input_audio_path, torch.tensor(audio_data).unsqueeze(0), sample_rate)
|
| 129 |
|
| 130 |
-
#
|
| 131 |
if not speaker_transcription or not speaker_transcription.strip():
|
| 132 |
speaker_transcription = DEFAULT_REFERENCE_TEXT
|
| 133 |
|
| 134 |
-
print(f"🎤
|
| 135 |
-
print(f"📝
|
| 136 |
|
| 137 |
-
#
|
| 138 |
synthesis_start = time.time()
|
| 139 |
|
| 140 |
try:
|
| 141 |
-
# 載入音訊為 16kHz
|
| 142 |
from cosyvoice.utils.file_utils import load_wav
|
| 143 |
prompt_speech_16k = load_wav(input_audio_path, 16000)
|
| 144 |
|
| 145 |
-
print("🔄
|
| 146 |
|
| 147 |
-
#
|
| 148 |
output = cosyvoice.inference_zero_shot_no_normalize(
|
| 149 |
content_text,
|
| 150 |
speaker_transcription,
|
| 151 |
prompt_speech_16k
|
| 152 |
)
|
| 153 |
|
| 154 |
-
# 保存輸出音訊
|
| 155 |
if output is not None and 'tts_speech' in output:
|
| 156 |
tts_speech = output['tts_speech']
|
| 157 |
torchaudio.save(output_audio_path, tts_speech, 22050)
|
| 158 |
|
| 159 |
synthesis_time = time.time() - synthesis_start
|
| 160 |
|
| 161 |
-
# 檢查輸出
|
| 162 |
if os.path.exists(output_audio_path):
|
| 163 |
-
# 讀取合成的音訊
|
| 164 |
synthesized_audio, file_sample_rate = torchaudio.load(output_audio_path)
|
| 165 |
synthesized_audio = synthesized_audio.numpy()
|
| 166 |
|
| 167 |
-
# 計算音訊長度
|
| 168 |
audio_duration = synthesized_audio.shape[1] / file_sample_rate
|
| 169 |
rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
|
| 170 |
|
| 171 |
-
# 檢查 VRAM 使用
|
| 172 |
vram_info = ""
|
| 173 |
if torch.cuda.is_available():
|
| 174 |
vram_used = torch.cuda.memory_allocated() / 1024**3
|
| 175 |
vram_info = f"💾 VRAM: {vram_used:.2f}GB"
|
| 176 |
|
| 177 |
-
status = f"""✅
|
| 178 |
|
| 179 |
-
🎙️
|
| 180 |
-
📝
|
| 181 |
-
📝
|
| 182 |
-
⏱️
|
| 183 |
-
🎵
|
| 184 |
-
📊 RTF: {rtf:.3f} {'(
|
| 185 |
{vram_info}
|
| 186 |
-
🤖
|
| 187 |
-
📊
|
| 188 |
-
🔧
|
| 189 |
|
| 190 |
return (file_sample_rate, synthesized_audio[0]), status
|
| 191 |
else:
|
| 192 |
-
return None, "❌
|
| 193 |
|
| 194 |
except Exception as e:
|
| 195 |
import traceback
|
| 196 |
traceback.print_exc()
|
| 197 |
-
return None, f"❌
|
| 198 |
|
| 199 |
except Exception as e:
|
| 200 |
import traceback
|
| 201 |
traceback.print_exc()
|
| 202 |
-
return None, f"❌
|
| 203 |
|
| 204 |
def load_example_text():
|
| 205 |
-
"""
|
| 206 |
return DEFAULT_REFERENCE_TEXT
|
| 207 |
|
| 208 |
-
#
|
| 209 |
-
with gr.Blocks(title="BreezyVoice
|
| 210 |
-
gr.Markdown("# 🎭 MediaTek BreezyVoice
|
| 211 |
-
gr.Markdown("**
|
| 212 |
-
gr.Markdown("📊 **
|
| 213 |
|
| 214 |
-
# 初始化狀態顯示
|
| 215 |
setup_status = gr.Textbox(
|
| 216 |
-
label="🔧
|
| 217 |
-
value="⏳
|
| 218 |
interactive=False
|
| 219 |
)
|
| 220 |
|
| 221 |
-
|
| 222 |
-
init_btn = gr.Button("🚀 初始化 BreezyVoice 混合版", variant="primary")
|
| 223 |
|
| 224 |
with gr.Row():
|
| 225 |
with gr.Column(scale=1):
|
| 226 |
-
gr.Markdown("### 🎙️
|
| 227 |
-
gr.Markdown("
|
| 228 |
|
| 229 |
-
#
|
| 230 |
-
gr.Markdown("#### 📖 建議朗讀範例:")
|
| 231 |
example_display = gr.Textbox(
|
| 232 |
value=DEFAULT_REFERENCE_TEXT,
|
| 233 |
-
label="
|
| 234 |
lines=4,
|
| 235 |
interactive=False
|
| 236 |
)
|
|
@@ -238,90 +229,74 @@ with gr.Blocks(title="BreezyVoice 混合版", theme=gr.themes.Soft()) as demo:
|
|
| 238 |
speaker_audio = gr.Audio(
|
| 239 |
sources=["microphone", "upload"],
|
| 240 |
type="numpy",
|
| 241 |
-
label="
|
| 242 |
)
|
| 243 |
|
| 244 |
-
gr.Markdown("### 📝
|
| 245 |
content_text = gr.Textbox(
|
| 246 |
lines=3,
|
| 247 |
-
placeholder="
|
| 248 |
-
label="
|
| 249 |
-
value="
|
| 250 |
)
|
| 251 |
|
| 252 |
-
gr.Markdown("### 🔤
|
| 253 |
speaker_transcription = gr.Textbox(
|
| 254 |
lines=3,
|
| 255 |
-
label="
|
| 256 |
value=DEFAULT_REFERENCE_TEXT
|
| 257 |
)
|
| 258 |
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
clone_btn = gr.Button("🎭 開始混合版語音克隆", variant="primary", size="lg")
|
| 263 |
|
| 264 |
with gr.Column(scale=1):
|
| 265 |
-
gr.Markdown("### 🎵
|
| 266 |
|
| 267 |
result_audio = gr.Audio(
|
| 268 |
-
label="
|
| 269 |
type="numpy"
|
| 270 |
)
|
| 271 |
|
| 272 |
result_status = gr.Textbox(
|
| 273 |
-
label="📋
|
| 274 |
lines=15,
|
| 275 |
max_lines=20,
|
| 276 |
interactive=False
|
| 277 |
)
|
| 278 |
|
| 279 |
-
|
| 280 |
-
with gr.Accordion("📖 使用說明", open=False):
|
| 281 |
gr.Markdown(f"""
|
| 282 |
-
## 🎯
|
| 283 |
-
1. **📖
|
| 284 |
-
2. **🎙️
|
| 285 |
-
3. **✨
|
| 286 |
|
| 287 |
-
## 📝
|
| 288 |
```
|
| 289 |
{DEFAULT_REFERENCE_TEXT}
|
| 290 |
```
|
| 291 |
|
| 292 |
-
## ⚡
|
| 293 |
-
- 🇹🇼
|
| 294 |
-
- 🎯
|
| 295 |
-
- ⚡ ZeroGPU
|
| 296 |
-
- 🔊 MediaTek
|
| 297 |
-
- 🤗 HuggingFace
|
| 298 |
|
| 299 |
-
## 💡
|
| 300 |
-
- **
|
| 301 |
-
- **
|
| 302 |
-
- **
|
| 303 |
-
- **
|
| 304 |
|
| 305 |
-
## 🙏
|
| 306 |
-
|
| 307 |
""")
|
| 308 |
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
outputs=[setup_status]
|
| 313 |
-
)
|
| 314 |
-
|
| 315 |
-
load_example_btn.click(
|
| 316 |
-
fn=load_example_text,
|
| 317 |
-
outputs=[speaker_transcription]
|
| 318 |
-
)
|
| 319 |
-
|
| 320 |
-
clone_btn.click(
|
| 321 |
-
fn=breezy_voice_clone_hybrid,
|
| 322 |
-
inputs=[speaker_audio, content_text, speaker_transcription],
|
| 323 |
-
outputs=[result_audio, result_status]
|
| 324 |
-
)
|
| 325 |
|
| 326 |
if __name__ == "__main__":
|
| 327 |
-
demo.launch()
|
|
|
|
| 1 |
"""
|
| 2 |
+
MediaTek BreezyVoice Hybrid Version
|
| 3 |
+
HF downloads the model + GitHub clone for inference code
|
| 4 |
+
Respect the author’s design and display model citation on HF
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
|
|
|
| 15 |
import sys
|
| 16 |
from pathlib import Path
|
| 17 |
|
| 18 |
+
# Set single-thread mode to avoid multiprocessing conflicts
|
| 19 |
torch.set_num_threads(1)
|
| 20 |
os.environ['OMP_NUM_THREADS'] = '1'
|
| 21 |
os.environ['MKL_NUM_THREADS'] = '1'
|
| 22 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 23 |
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
| 24 |
|
| 25 |
+
# Default reference voice sample (about 20 seconds of reading)
|
| 26 |
+
DEFAULT_REFERENCE_TEXT = "Taiwan is a beautiful island with rich natural scenery and diverse cultural characteristics. From Yangmingshan in the north to Kenting in the south, each place has its own unique charm. The four distinct seasons make life here full of variety—cherry blossoms in spring, beach fun in summer, maple leaves in autumn, and hot springs in winter."
|
| 27 |
|
| 28 |
+
# Global variables
|
| 29 |
cosyvoice = None
|
| 30 |
setup_completed = False
|
| 31 |
|
| 32 |
@spaces.GPU(duration=300)
|
| 33 |
def setup_breezyvoice_hybrid():
|
| 34 |
+
"""Setup BreezyVoice Hybrid Version - HF model + GitHub code"""
|
| 35 |
global cosyvoice, setup_completed
|
| 36 |
|
| 37 |
if setup_completed:
|
| 38 |
+
return "✅ BreezyVoice Hybrid version is ready"
|
| 39 |
|
| 40 |
try:
|
| 41 |
+
print("🔧 Setting up BreezyVoice Hybrid Version...")
|
| 42 |
+
print("📊 Strategy: Download model from HF + Download inference code from GitHub")
|
| 43 |
|
| 44 |
+
# 1. Download model from HuggingFace (respecting the author with citation)
|
| 45 |
try:
|
| 46 |
from huggingface_hub import snapshot_download
|
| 47 |
+
print("📥 Downloading MediaTek-Research/BreezyVoice model from HuggingFace...")
|
| 48 |
|
| 49 |
model_path = snapshot_download(
|
| 50 |
"MediaTek-Research/BreezyVoice",
|
| 51 |
cache_dir="/tmp/hf_models",
|
| 52 |
local_files_only=False
|
| 53 |
)
|
| 54 |
+
print(f"✅ HF model downloaded: {model_path}")
|
| 55 |
|
| 56 |
except Exception as e:
|
| 57 |
+
print(f"⚠️ HF model download failed: {e}")
|
| 58 |
+
print("🔄 Falling back to direct GitHub model path...")
|
| 59 |
+
model_path = "MediaTek-Research/BreezyVoice" # fallback
|
| 60 |
|
| 61 |
+
# 2. Download inference code from GitHub (as designed by the author)
|
| 62 |
repo_path = "/tmp/BreezyVoice"
|
| 63 |
if not os.path.exists(repo_path):
|
| 64 |
+
print("📥 Downloading BreezyVoice inference code from GitHub...")
|
| 65 |
result = subprocess.run([
|
| 66 |
"git", "clone",
|
| 67 |
"https://github.com/mtkresearch/BreezyVoice.git",
|
|
|
|
| 69 |
], capture_output=True, text=True, timeout=300)
|
| 70 |
|
| 71 |
if result.returncode != 0:
|
| 72 |
+
raise Exception(f"Code download failed: {result.stderr}")
|
| 73 |
|
| 74 |
+
# 3. Add module path
|
| 75 |
sys.path.insert(0, repo_path)
|
| 76 |
|
| 77 |
+
# 4. Import core module from author
|
| 78 |
try:
|
| 79 |
from single_inference import CustomCosyVoice
|
| 80 |
+
print("✅ BreezyVoice core module imported successfully")
|
| 81 |
except ImportError as e:
|
| 82 |
+
raise Exception(f"Module import failed: {e}")
|
| 83 |
|
| 84 |
+
# 5. Initialize model (hybrid: HF model path + GitHub code)
|
| 85 |
+
print(f"🔄 Initializing hybrid model...")
|
| 86 |
+
print(f"📍 Model path: {model_path}")
|
| 87 |
|
| 88 |
cosyvoice = CustomCosyVoice(model_path)
|
| 89 |
|
| 90 |
setup_completed = True
|
| 91 |
+
print("✅ BreezyVoice hybrid setup complete!")
|
| 92 |
|
| 93 |
+
# Check VRAM usage
|
| 94 |
if torch.cuda.is_available():
|
| 95 |
vram_used = torch.cuda.memory_allocated() / 1024**3
|
| 96 |
+
return f"✅ BreezyVoice Hybrid setup complete!\n📊 Model source: HuggingFace MediaTek-Research/BreezyVoice\n🔧 Inference code: GitHub mtkresearch/BreezyVoice\n💾 VRAM usage: {vram_used:.2f}GB"
|
| 97 |
|
| 98 |
+
return "✅ BreezyVoice Hybrid setup complete!"
|
| 99 |
|
| 100 |
except Exception as e:
|
| 101 |
+
print(f"❌ Setup failed: {str(e)}")
|
| 102 |
+
return f"❌ Setup failed: {str(e)}"
|
| 103 |
|
| 104 |
@spaces.GPU(duration=180)
|
| 105 |
def breezy_voice_clone_hybrid(speaker_audio, content_text, speaker_transcription=None):
|
| 106 |
+
"""Run BreezyVoice voice cloning - hybrid version"""
|
| 107 |
global cosyvoice
|
| 108 |
|
| 109 |
if speaker_audio is None:
|
| 110 |
+
return None, "❌ Please upload or record a reference voice first"
|
| 111 |
|
| 112 |
if not content_text.strip():
|
| 113 |
+
return None, "❌ Please enter the text to synthesize"
|
| 114 |
|
| 115 |
if not setup_completed or cosyvoice is None:
|
| 116 |
setup_status = setup_breezyvoice_hybrid()
|
|
|
|
| 119 |
|
| 120 |
try:
|
| 121 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 122 |
+
# Handle input audio
|
| 123 |
input_audio_path = os.path.join(temp_dir, "speaker_voice.wav")
|
| 124 |
output_audio_path = os.path.join(temp_dir, "cloned_voice.wav")
|
| 125 |
|
| 126 |
+
# Save reference audio
|
| 127 |
sample_rate, audio_data = speaker_audio
|
| 128 |
torchaudio.save(input_audio_path, torch.tensor(audio_data).unsqueeze(0), sample_rate)
|
| 129 |
|
| 130 |
+
# Use provided or default transcription
|
| 131 |
if not speaker_transcription or not speaker_transcription.strip():
|
| 132 |
speaker_transcription = DEFAULT_REFERENCE_TEXT
|
| 133 |
|
| 134 |
+
print(f"🎤 Synthesizing text: {content_text}")
|
| 135 |
+
print(f"📝 Reference transcription: {speaker_transcription}")
|
| 136 |
|
| 137 |
+
# Run synthesis - hybrid version
|
| 138 |
synthesis_start = time.time()
|
| 139 |
|
| 140 |
try:
|
|
|
|
| 141 |
from cosyvoice.utils.file_utils import load_wav
|
| 142 |
prompt_speech_16k = load_wav(input_audio_path, 16000)
|
| 143 |
|
| 144 |
+
print("🔄 Running hybrid voice synthesis inference...")
|
| 145 |
|
| 146 |
+
# Use no_normalize version (as in local testing)
|
| 147 |
output = cosyvoice.inference_zero_shot_no_normalize(
|
| 148 |
content_text,
|
| 149 |
speaker_transcription,
|
| 150 |
prompt_speech_16k
|
| 151 |
)
|
| 152 |
|
|
|
|
| 153 |
if output is not None and 'tts_speech' in output:
|
| 154 |
tts_speech = output['tts_speech']
|
| 155 |
torchaudio.save(output_audio_path, tts_speech, 22050)
|
| 156 |
|
| 157 |
synthesis_time = time.time() - synthesis_start
|
| 158 |
|
|
|
|
| 159 |
if os.path.exists(output_audio_path):
|
|
|
|
| 160 |
synthesized_audio, file_sample_rate = torchaudio.load(output_audio_path)
|
| 161 |
synthesized_audio = synthesized_audio.numpy()
|
| 162 |
|
|
|
|
| 163 |
audio_duration = synthesized_audio.shape[1] / file_sample_rate
|
| 164 |
rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
|
| 165 |
|
|
|
|
| 166 |
vram_info = ""
|
| 167 |
if torch.cuda.is_available():
|
| 168 |
vram_used = torch.cuda.memory_allocated() / 1024**3
|
| 169 |
vram_info = f"💾 VRAM: {vram_used:.2f}GB"
|
| 170 |
|
| 171 |
+
status = f"""✅ Hybrid voice cloning successful!
|
| 172 |
|
| 173 |
+
🎙️ Reference audio: {len(audio_data)/sample_rate:.1f}s
|
| 174 |
+
📝 Synthesized content: {content_text}
|
| 175 |
+
📝 Used transcription: {speaker_transcription[:30]}...
|
| 176 |
+
⏱️ Synthesis time: {synthesis_time:.1f}s
|
| 177 |
+
🎵 Output length: {audio_duration:.1f}s
|
| 178 |
+
📊 RTF: {rtf:.3f} {'(real-time)' if rtf < 1.0 else '(non real-time)'}
|
| 179 |
{vram_info}
|
| 180 |
+
🤖 Model: MediaTek BreezyVoice Hybrid
|
| 181 |
+
📊 Model source: HuggingFace MediaTek-Research/BreezyVoice
|
| 182 |
+
🔧 Inference code: GitHub mtkresearch/BreezyVoice"""
|
| 183 |
|
| 184 |
return (file_sample_rate, synthesized_audio[0]), status
|
| 185 |
else:
|
| 186 |
+
return None, "❌ Synthesis failed: No output file generated"
|
| 187 |
|
| 188 |
except Exception as e:
|
| 189 |
import traceback
|
| 190 |
traceback.print_exc()
|
| 191 |
+
return None, f"❌ Synthesis failed: {str(e)}"
|
| 192 |
|
| 193 |
except Exception as e:
|
| 194 |
import traceback
|
| 195 |
traceback.print_exc()
|
| 196 |
+
return None, f"❌ Processing error: {str(e)}"
|
| 197 |
|
| 198 |
def load_example_text():
|
| 199 |
+
"""Load default example text"""
|
| 200 |
return DEFAULT_REFERENCE_TEXT
|
| 201 |
|
| 202 |
+
# Build Gradio interface
|
| 203 |
+
with gr.Blocks(title="BreezyVoice Hybrid", theme=gr.themes.Soft()) as demo:
|
| 204 |
+
gr.Markdown("# 🎭 MediaTek BreezyVoice Hybrid")
|
| 205 |
+
gr.Markdown("**Zero-shot voice cloning system** — optimized for Traditional Chinese (Taiwan)")
|
| 206 |
+
gr.Markdown("📊 **Architecture**: HuggingFace model + GitHub inference code")
|
| 207 |
|
|
|
|
| 208 |
setup_status = gr.Textbox(
|
| 209 |
+
label="🔧 System Status",
|
| 210 |
+
value="⏳ Preparing BreezyVoice Hybrid initialization...",
|
| 211 |
interactive=False
|
| 212 |
)
|
| 213 |
|
| 214 |
+
init_btn = gr.Button("🚀 Initialize BreezyVoice Hybrid", variant="primary")
|
|
|
|
| 215 |
|
| 216 |
with gr.Row():
|
| 217 |
with gr.Column(scale=1):
|
| 218 |
+
gr.Markdown("### 🎙️ Step 1: Upload Reference Voice")
|
| 219 |
+
gr.Markdown("Please read the following sample text aloud and upload a 5–20 second clear voice recording")
|
| 220 |
|
| 221 |
+
gr.Markdown("#### 📖 Recommended Reading Sample:")
|
|
|
|
| 222 |
example_display = gr.Textbox(
|
| 223 |
value=DEFAULT_REFERENCE_TEXT,
|
| 224 |
+
label="Read this text aloud (around 20 seconds)",
|
| 225 |
lines=4,
|
| 226 |
interactive=False
|
| 227 |
)
|
|
|
|
| 229 |
speaker_audio = gr.Audio(
|
| 230 |
sources=["microphone", "upload"],
|
| 231 |
type="numpy",
|
| 232 |
+
label="Reference voice recording (read the text above)"
|
| 233 |
)
|
| 234 |
|
| 235 |
+
gr.Markdown("### 📝 Step 2: Enter Text to Synthesize")
|
| 236 |
content_text = gr.Textbox(
|
| 237 |
lines=3,
|
| 238 |
+
placeholder="Enter the text you want spoken in the cloned voice...",
|
| 239 |
+
label="Synthesis text",
|
| 240 |
+
value="Welcome to our voice synthesis system! This technology can mimic anyone’s voice and convert text into natural, fluent speech."
|
| 241 |
)
|
| 242 |
|
| 243 |
+
gr.Markdown("### 🔤 Step 3: Reference Voice Transcription")
|
| 244 |
speaker_transcription = gr.Textbox(
|
| 245 |
lines=3,
|
| 246 |
+
label="Reference transcription (default example)",
|
| 247 |
value=DEFAULT_REFERENCE_TEXT
|
| 248 |
)
|
| 249 |
|
| 250 |
+
load_example_btn = gr.Button("📄 Load Default Example", variant="secondary")
|
| 251 |
+
clone_btn = gr.Button("🎭 Start Hybrid Voice Cloning", variant="primary", size="lg")
|
|
|
|
|
|
|
| 252 |
|
| 253 |
with gr.Column(scale=1):
|
| 254 |
+
gr.Markdown("### 🎵 Cloning Result")
|
| 255 |
|
| 256 |
result_audio = gr.Audio(
|
| 257 |
+
label="Cloned Voice",
|
| 258 |
type="numpy"
|
| 259 |
)
|
| 260 |
|
| 261 |
result_status = gr.Textbox(
|
| 262 |
+
label="📋 Processing Status",
|
| 263 |
lines=15,
|
| 264 |
max_lines=20,
|
| 265 |
interactive=False
|
| 266 |
)
|
| 267 |
|
| 268 |
+
with gr.Accordion("📖 User Guide", open=False):
|
|
|
|
| 269 |
gr.Markdown(f"""
|
| 270 |
+
## 🎯 Best Practices
|
| 271 |
+
1. **📖 Read the sample clearly**
|
| 272 |
+
2. **🎙️ Recording requirements**: 5–20 seconds, quiet environment, clear pronunciation
|
| 273 |
+
3. **✨ Cloning effect**: The system will use your voice to speak any text
|
| 274 |
|
| 275 |
+
## 📝 Sample Text
|
| 276 |
```
|
| 277 |
{DEFAULT_REFERENCE_TEXT}
|
| 278 |
```
|
| 279 |
|
| 280 |
+
## ⚡ Technical Highlights
|
| 281 |
+
- 🇹🇼 Optimized for Traditional Chinese (Taiwan)
|
| 282 |
+
- 🎯 Zero-shot cloning (no training required)
|
| 283 |
+
- ⚡ ZeroGPU accelerated processing
|
| 284 |
+
- 🔊 MediaTek advanced speech synthesis
|
| 285 |
+
- 🤗 HuggingFace model citation + GitHub inference code
|
| 286 |
|
| 287 |
+
## 💡 Advantages of the Hybrid Version
|
| 288 |
+
- **Model Source**: HuggingFace MediaTek-Research/BreezyVoice
|
| 289 |
+
- **Inference Code**: GitHub mtkresearch/BreezyVoice
|
| 290 |
+
- **Best Practice**: Respect author design and display model usage
|
| 291 |
+
- **Stability**: Combines official model with original inference code
|
| 292 |
|
| 293 |
+
## 🙏 Acknowledgment
|
| 294 |
+
Thanks to MediaTek Research for developing the BreezyVoice model.
|
| 295 |
""")
|
| 296 |
|
| 297 |
+
init_btn.click(fn=setup_breezyvoice_hybrid, outputs=[setup_status])
|
| 298 |
+
load_example_btn.click(fn=load_example_text, outputs=[speaker_transcription])
|
| 299 |
+
clone_btn.click(fn=breezy_voice_clone_hybrid, inputs=[speaker_audio, content_text, speaker_transcription], outputs=[result_audio, result_status])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
|
| 301 |
if __name__ == "__main__":
|
| 302 |
+
demo.launch()
|