akhaliq HF Staff committed on
Commit
67d30f5
·
verified ·
1 Parent(s): d9396d8

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +204 -0
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import soundfile as sf
3
+ import numpy as np
4
+ from voxcpm import VoxCPM
5
+ import tempfile
6
+ import os
7
+ import spaces
8
+
9
# Build the VoxCPM instance a single time at import; generate_speech reuses
# this module-level object for every request instead of reloading it.
_MODEL_REPO = "openbmb/VoxCPM-0.5B"
model = VoxCPM.from_pretrained(_MODEL_REPO)
11
+
12
@spaces.GPU(duration=120)
def generate_speech(
    text,
    prompt_audio,
    prompt_text,
    cfg_value,
    inference_timesteps,
    normalize,
    denoise,
    retry_badcase,
    retry_badcase_max_times,
    retry_badcase_ratio_threshold
):
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Args:
        text: Text to synthesize. Empty or whitespace-only input shows a
            warning toast and yields no audio.
        prompt_audio: Path to a reference audio file, or ``None`` when no
            reference audio is supplied.
        prompt_text: Transcript of the reference audio. Empty or
            whitespace-only values are forwarded to the model as ``None``.
        cfg_value: Forwarded unchanged as ``cfg_value``.
        inference_timesteps: Forwarded as ``inference_timesteps`` after being
            cast to ``int`` (UI sliders may deliver floats).
        normalize: Forwarded unchanged as ``normalize``.
        denoise: Forwarded unchanged as ``denoise``.
        retry_badcase: Forwarded unchanged as ``retry_badcase``.
        retry_badcase_max_times: Forwarded as ``retry_badcase_max_times``
            after being cast to ``int``.
        retry_badcase_ratio_threshold: Forwarded unchanged as
            ``retry_badcase_ratio_threshold``.

    Returns:
        Path to a temporary ``.wav`` file holding the generated audio, or
        ``None`` when ``text`` is empty.

    Raises:
        gr.Error: If generation or writing the output file fails. Gradio only
            shows the message to the user when the error is raised.
    """
    if not text or not text.strip():
        gr.Warning("Please enter text to generate speech")
        return None

    # Treat "no file" uniformly as None.
    prompt_wav_path = prompt_audio or None

    # An untouched Textbox sends "", so normalise both empty and
    # whitespace-only transcripts to None.
    # NOTE(review): confirm whether model.generate requires prompt_text
    # whenever prompt_wav_path is given; if so, validate that pairing here.
    if not prompt_text or not prompt_text.strip():
        prompt_text = None

    try:
        wav = model.generate(
            text=text,
            prompt_wav_path=prompt_wav_path,
            prompt_text=prompt_text,
            cfg_value=cfg_value,
            inference_timesteps=int(inference_timesteps),
            normalize=normalize,
            denoise=denoise,
            retry_badcase=retry_badcase,
            retry_badcase_max_times=int(retry_badcase_max_times),
            retry_badcase_ratio_threshold=retry_badcase_ratio_threshold
        )

        # Reserve a unique path and close the handle before soundfile reopens
        # the file by name. delete=False keeps the file on disk so Gradio can
        # serve it after this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # 16000 is the sample rate written into the WAV header
        # (presumably the model's output rate -- TODO confirm).
        sf.write(output_path, wav, 16000)
    except Exception as e:
        # UI boundary: surface any failure to the user instead of returning
        # an empty result without explanation.
        raise gr.Error(f"Error generating speech: {str(e)}") from e

    return output_path
61
+
62
def _generate_example(text, reference_audio, reference_text, cfg, timesteps):
    """Run ``generate_speech`` for one row of the examples table.

    The examples only provide the first five inputs, so the remaining
    arguments are pinned to the values the Advanced Settings widgets start
    with (normalize, denoise and retry enabled, 3 retries, ratio 6.0).
    """
    return generate_speech(
        text, reference_audio, reference_text, cfg, timesteps,
        True, True, True, 3, 6.0
    )


# Create Gradio interface
with gr.Blocks(title="VoxCPM Text-to-Speech") as demo:
    gr.Markdown(
        """
        # 🎙️ VoxCPM Text-to-Speech

        Generate highly expressive speech using VoxCPM-0.5B model. Optionally clone voices by providing reference audio.

        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech."
            )

            with gr.Accordion("Voice Cloning (Optional)", open=False):
                # gr.Audio has no `info` keyword (passing it fails when the
                # component is constructed), so the hint is rendered as text.
                gr.Markdown("Upload a reference audio file for voice cloning")
                prompt_audio = gr.Audio(
                    label="Reference Audio",
                    type="filepath",
                    sources=["upload"]
                )
                prompt_text = gr.Textbox(
                    label="Reference Text",
                    placeholder="Text corresponding to the reference audio (optional)",
                    lines=2
                )

            with gr.Accordion("Advanced Settings", open=False):
                cfg_value = gr.Slider(
                    minimum=0.5,
                    maximum=5.0,
                    value=2.0,
                    step=0.1,
                    label="CFG Value",
                    info="LM guidance on LocDiT, higher for better adherence to prompt"
                )

                inference_timesteps = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=10,
                    step=1,
                    label="Inference Timesteps",
                    info="Higher for better quality, lower for faster speed"
                )

                with gr.Row():
                    normalize = gr.Checkbox(
                        value=True,
                        label="Normalize",
                        info="Enable external TN tool"
                    )
                    denoise = gr.Checkbox(
                        value=True,
                        label="Denoise",
                        info="Enable external Denoise tool"
                    )
                    retry_badcase = gr.Checkbox(
                        value=True,
                        label="Retry Bad Cases",
                        info="Enable retrying for bad cases"
                    )

                with gr.Row():
                    retry_badcase_max_times = gr.Number(
                        value=3,
                        minimum=1,
                        maximum=10,
                        step=1,
                        label="Max Retry Times"
                    )
                    retry_badcase_ratio_threshold = gr.Number(
                        value=6.0,
                        minimum=1.0,
                        maximum=10.0,
                        step=0.5,
                        label="Retry Ratio Threshold"
                    )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Output section
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                autoplay=False
            )

            gr.Markdown(
                """
                ### Tips:
                - For voice cloning, upload a clear reference audio (3-10 seconds recommended)
                - Higher CFG values provide better prompt adherence but may affect naturalness
                - Increase inference timesteps for better quality at the cost of speed
                - The retry mechanism helps handle edge cases automatically
                """
            )

    # Examples: each row fills only the five inputs listed below; the
    # remaining generate_speech arguments come from _generate_example.
    gr.Examples(
        examples=[
            ["Hello! Welcome to the VoxCPM text-to-speech demonstration. This model can generate highly expressive and natural-sounding speech.", None, None, 2.0, 10],
            ["The quick brown fox jumps over the lazy dog. This pangram contains all letters of the alphabet.", None, None, 2.5, 15],
            ["Artificial intelligence is transforming the way we interact with technology, making it more natural and intuitive.", None, None, 2.0, 10],
        ],
        inputs=[text_input, prompt_audio, prompt_text, cfg_value, inference_timesteps],
        outputs=audio_output,
        fn=_generate_example,
        cache_examples=True,
        cache_mode="lazy"
    )

    # Connect the generate button; the input order must match the positional
    # parameter order of generate_speech.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            prompt_audio,
            prompt_text,
            cfg_value,
            inference_timesteps,
            normalize,
            denoise,
            retry_badcase,
            retry_badcase_max_times,
            retry_badcase_ratio_threshold
        ],
        outputs=audio_output,
        show_progress="full"
    )

demo.launch()