add MaskGCT voice cloning option
Browse files
webgui.py
CHANGED
|
@@ -212,6 +212,22 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
|
|
| 212 |
video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
|
| 213 |
|
| 214 |
return final_output_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
with gr.Blocks() as demo:
|
| 217 |
gr.Markdown('# EchoMimic')
|
|
@@ -228,6 +244,20 @@ with gr.Blocks() as demo:
|
|
| 228 |
with gr.Column():
|
| 229 |
uploaded_img = gr.Image(type="filepath", label="Reference Image")
|
| 230 |
uploaded_audio = gr.Audio(type="filepath", label="Input Audio")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
with gr.Accordion(label=advanced_settings_label, open=False):
|
| 232 |
with gr.Row():
|
| 233 |
width = gr.Slider(label="Width", minimum=128, maximum=1024, value=default_values["width"], interactive=available_property)
|
|
@@ -297,6 +327,14 @@ with gr.Blocks() as demo:
|
|
| 297 |
output_video= final_output_path
|
| 298 |
return final_output_path
|
| 299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
generate_button.click(
|
| 301 |
generate_video,
|
| 302 |
inputs=[
|
|
|
|
| 212 |
video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
|
| 213 |
|
| 214 |
return final_output_path
|
| 215 |
+
|
| 216 |
+
def get_maskGCT_TTS(prompt_audio_maskGCT, audio_to_clone):
    """Generate a voice-cloned clip via the ``amphion/maskgct`` Space.

    Args:
        prompt_audio_maskGCT: Text to synthesize; forwarded to the remote
            endpoint as ``target_text``.
        audio_to_clone: Filepath of the reference voice recording; forwarded
            to the remote endpoint as ``prompt_wav``.

    Returns:
        A 2-tuple ``(result, update)`` where ``result`` is whatever the
        remote ``/predict`` endpoint returned and ``update`` is a
        ``gr.update`` that sets that same result as value and makes the
        receiving component visible.

    Raises:
        gr.Error: If no reference audio was supplied, or if the client for
            the remote Space could not be created.
    """
    # Fail early with a readable message instead of letting the file
    # wrapper choke on a missing path.
    if not audio_to_clone:
        raise gr.Error("Please provide a voice to clone before generating.")

    try:
        client = Client("amphion/maskgct")
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate, and chain the cause so it shows up in server logs.
        raise gr.Error(
            "amphion/maskgct space's api might not be ready, please wait, or upload an audio instead."
        ) from exc

    result = client.predict(
        prompt_wav=handle_file(audio_to_clone),
        target_text=prompt_audio_maskGCT,
        target_len=-1,
        n_timesteps=25,
        api_name="/predict",
    )
    print(result)
    return result, gr.update(value=result, visible=True)
|
| 231 |
|
| 232 |
with gr.Blocks() as demo:
|
| 233 |
gr.Markdown('# EchoMimic')
|
|
|
|
| 244 |
with gr.Column():
|
| 245 |
uploaded_img = gr.Image(type="filepath", label="Reference Image")
|
| 246 |
uploaded_audio = gr.Audio(type="filepath", label="Input Audio")
|
| 247 |
+
preprocess_audio_file = gr.File(visible=False)
|
| 248 |
+
with gr.Accordion(label="Voice cloning with MaskGCT", open=False):
|
| 249 |
+
prompt_audio_maskGCT = gr.Textbox(
|
| 250 |
+
label = "Text to synthetize",
|
| 251 |
+
lines = 2,
|
| 252 |
+
max_lines = 2,
|
| 253 |
+
elem_id = "text-synth-maskGCT"
|
| 254 |
+
)
|
| 255 |
+
audio_to_clone_maskGCT = gr.Audio(
|
| 256 |
+
label = "Voice to clone",
|
| 257 |
+
type = "filepath",
|
| 258 |
+
elem_id = "audio-clone-elm-maskGCT"
|
| 259 |
+
)
|
| 260 |
+
gen_maskGCT_voice_btn = gr.Button("Generate voice clone (optional)")
|
| 261 |
with gr.Accordion(label=advanced_settings_label, open=False):
|
| 262 |
with gr.Row():
|
| 263 |
width = gr.Slider(label="Width", minimum=128, maximum=1024, value=default_values["width"], interactive=available_property)
|
|
|
|
| 327 |
output_video= final_output_path
|
| 328 |
return final_output_path
|
| 329 |
|
| 330 |
+
gen_maskGCT_voice_btn.click(
|
| 331 |
+
fn = get_maskGCT_TTS,
|
| 332 |
+
inputs = [prompt_audio_maskGCT, audio_to_clone_maskGCT],
|
| 333 |
+
outputs = [voice, preprocess_audio_file],
|
| 334 |
+
queue = False,
|
| 335 |
+
show_api = False
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
generate_button.click(
|
| 339 |
generate_video,
|
| 340 |
inputs=[
|