"""Gradio demo that lip-syncs a portrait image to an audio clip.

The UI collects an image, an audio file and an emotional style, runs
``inference_for_demo_video.py`` in a subprocess, then re-encodes the produced
video to H.264/AAC before handing it to the video player.
"""

import subprocess
import sys
from typing import Optional, Sequence

import gradio as gr
from moviepy.editor import VideoFileClip

# Bundled sample audio clips offered in the "Choose an audio" dropdown.
AUDIO_CHOICES = [
    "German1.wav",
    "German2.wav",
    "German3.wav",
    "German4.wav",
    "acknowledgement_chinese.m4a",
    "acknowledgement_english.m4a",
    "chinese1_haierlizhi.wav",
    "chinese2_guanyu.wav",
    "french1.wav",
    "french2.wav",
    "french3.wav",
    "italian1.wav",
    "italian2.wav",
    "italian3.wav",
    "japan1.wav",
    "japan2.wav",
    "japan3.wav",
    "korean1.wav",
    "korean2.wav",
    "korean3.wav",
    "noisy_audio_cafeter_snr_0.wav",
    "noisy_audio_meeting_snr_0.wav",
    "noisy_audio_meeting_snr_10.wav",
    "noisy_audio_meeting_snr_20.wav",
    "noisy_audio_narrative.wav",
    "noisy_audio_office_snr_0.wav",
    "out_of_domain_narrative.wav",
    "spanish1.wav",
    "spanish2.wav",
    "spanish3.wav",
]
DEFAULT_AUDIO = "acknowledgement_english.m4a"

# Style clips selectable in the "emotional style" dropdown; each name is
# resolved relative to data/style_clip/3DMM/ when the command is built.
EMOTIONAL_STYLE_CHOICES = [
    "M030_front_angry_level3_001.mat",
    "M030_front_contempt_level3_001.mat",
    "M030_front_disgusted_level3_001.mat",
    "M030_front_fear_level3_001.mat",
    "M030_front_happy_level3_001.mat",
    "M030_front_neutral_level1_001.mat",
    "M030_front_sad_level3_001.mat",
    "M030_front_surprised_level3_001.mat",
    "W009_front_angry_level3_001.mat",
    "W009_front_contempt_level3_001.mat",
    "W009_front_disgusted_level3_001.mat",
    "W009_front_fear_level3_001.mat",
    "W009_front_happy_level3_001.mat",
    "W009_front_neutral_level1_001.mat",
    "W009_front_sad_level3_001.mat",
    "W009_front_surprised_level3_001.mat",
    "W011_front_angry_level3_001.mat",
    "W011_front_contempt_level3_001.mat",
    "W011_front_disgusted_level3_001.mat",
    "W011_front_fear_level3_001.mat",
    "W011_front_happy_level3_001.mat",
    "W011_front_neutral_level1_001.mat",
    "W011_front_sad_level3_001.mat",
    "W011_front_surprised_level3_001.mat",
]
DEFAULT_EMOTIONAL_STYLE = "M030_front_neutral_level1_001.mat"

# Sample portraits shown under the image input.
EXAMPLE_IMAGES = [
    "data/src_img/uncropped/face3.png",
    "data/src_img/uncropped/male_face.png",
    "data/src_img/uncropped/uncut_src_img.jpg",
    "data/src_img/cropped/chpa5.png",
    "data/src_img/cropped/cut_img.png",
    "data/src_img/cropped/f30.png",
    "data/src_img/cropped/menglu2.png",
    "data/src_img/cropped/nscu2.png",
    "data/src_img/cropped/zp1.png",
    "data/src_img/cropped/zt12.png",
]

# Header text rendered at the top of the page.
HEADER_HTML = """

DreamTalk: When Expressive Talking Head Generation Meets Diffusion Probabilistic Models

DreamTalk is a diffusion-based audio-driven expressive talking head generation framework that can produce high-quality talking head videos across diverse speaking styles. DreamTalk exhibits robust performance with a diverse array of inputs, including songs, speech in multiple languages, noisy audio, and out-of-domain portraits.

"""

css = """ #col-container{ margin: 0 auto; max-width: 940px; } """


def convert_to_mp4_with_aac(input_path: str, output_path: str) -> str:
    """Re-encode a video as H.264 video with AAC audio.

    Args:
        input_path: Path of the video to read.
        output_path: Path the re-encoded video is written to.

    Returns:
        ``output_path``, for convenient chaining.
    """
    # The context manager closes the clip's readers even if encoding fails.
    with VideoFileClip(input_path) as video:
        video.write_videofile(output_path, codec="libx264", audio_codec="aac")
    return output_path


def load_audio(audio_listed: str) -> str:
    """Return the path of a bundled sample audio file given its file name."""
    return f"data/audio/{audio_listed}"


def execute_command(command: Sequence[str]) -> None:
    """Run an argv-style command and wait for it to finish.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    subprocess.run(command, check=True)


def infer(
    audio_input: Optional[str],
    image_path: Optional[str],
    emotional_style: str,
) -> str:
    """Generate a talking-head video and return the path of the result.

    Args:
        audio_input: Path of the driving audio file.
        image_path: Path of the portrait image.
        emotional_style: File name of the style clip, looked up under
            ``data/style_clip/3DMM/``.

    Returns:
        Path of the re-encoded MP4 video.

    Raises:
        gr.Error: If the image or the audio is missing.
        subprocess.CalledProcessError: If the inference script fails.
    """
    # Fail early with a readable message instead of passing "None" as a path
    # to the inference script.
    if not image_path:
        raise gr.Error("Please upload an image before running.")
    if not audio_input:
        raise gr.Error("Please provide an audio input before running.")

    output_name = "lipsynced_result"

    command = [
        # Use the interpreter running this app so the script sees the same
        # environment and installed packages.
        sys.executable,
        "inference_for_demo_video.py",
        f"--wav_path={audio_input}",
        f"--style_clip_path=data/style_clip/3DMM/{emotional_style}",
        "--pose_path=data/pose/RichardShelby_front_neutral_level1_001.mat",
        f"--image_path={image_path}",
        "--cfg_scale=1.0",
        "--max_gen_len=30",
        f"--output_name={output_name}",
    ]
    execute_command(command)

    # Convert video to compatible codecs
    input_file = f"output_video/{output_name}.mp4"
    output_file = f"{output_name}.mp4"
    return convert_to_mp4_with_aac(input_file, output_file)


# NOTE(review): the nesting of the layout below was reconstructed from the
# order of the statements; confirm it matches the intended page layout.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(HEADER_HTML)
        with gr.Row():
            with gr.Column():
                image_path = gr.Image(
                    label="Image", type="filepath", sources=["upload"]
                )
                audio_input = gr.Audio(
                    label="Audio input",
                    type="filepath",
                    sources=["upload"],
                    value="data/audio/acknowledgement_english.m4a",
                )
                with gr.Row():
                    audio_list = gr.Dropdown(
                        label="Choose an audio (optional)",
                        choices=AUDIO_CHOICES,
                        value=DEFAULT_AUDIO,
                    )
                    # Picking a sample replaces the current audio input.
                    audio_list.change(
                        fn=load_audio,
                        inputs=[audio_list],
                        outputs=[audio_input],
                    )
                    emotional_style = gr.Dropdown(
                        label="emotional style",
                        choices=EMOTIONAL_STYLE_CHOICES,
                        value=DEFAULT_EMOTIONAL_STYLE,
                    )
                gr.Examples(
                    examples=EXAMPLE_IMAGES,
                    inputs=[image_path],
                    examples_per_page=5,
                )
                run_btn = gr.Button("Run")
            with gr.Column():
                output_video = gr.Video(format="mp4")
                gr.HTML(""" """)

    run_btn.click(
        fn=infer,
        inputs=[audio_input, image_path, emotional_style],
        outputs=[output_video],
    )

demo.queue().launch()