"""Gradio demo for DreamTalk.

Takes a portrait image, an audio clip and an emotional style, runs
``inference_for_demo_video.py`` on them and displays the resulting video.
"""

import gradio as gr
import subprocess
from moviepy.editor import VideoFileClip

def convert_to_mp4_with_aac(input_path, output_path):
    """Re-encode a video with the libx264 video codec and AAC audio codec.

    Args:
        input_path: Path of the video file to read.
        output_path: Path the re-encoded video is written to.

    Returns:
        ``output_path``, unchanged.
    """
    video = VideoFileClip(input_path)
    try:
        video.write_videofile(output_path, codec="libx264", audio_codec="aac")
    finally:
        # Always release the clip's resources, even when encoding fails,
        # so repeated requests do not accumulate open readers.
        video.close()

    return output_path

def load_audio(audio_listed):
    """Build the path of a bundled audio clip under ``data/audio``.

    Args:
        audio_listed: File name selected in the audio dropdown.

    Returns:
        The relative path ``data/audio/<audio_listed>``.
    """
    audio_dir = "data/audio"
    return "{}/{}".format(audio_dir, audio_listed)

def execute_command(command: "list[str]") -> None:
    """Run an external command and wait for it to finish.

    Args:
        command: The program followed by its arguments, as an argument list
            (for example ``["python", "script.py", "--flag=value"]``). The
            command is run directly, without a shell.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    subprocess.run(command, check=True)

def infer(audio_input, image_path, emotional_style):
    """Generate a talking-head video from a portrait image and an audio clip.

    Runs ``inference_for_demo_video.py`` as a subprocess, then re-encodes the
    video expected at ``output_video/lipsynced_result.mp4`` into
    ``lipsynced_result.mp4`` in the working directory.

    Args:
        audio_input: Path of the driving audio file.
        image_path: Path of the portrait image.
        emotional_style: File name of a style clip under
            ``data/style_clip/3DMM``.

    Returns:
        Path of the re-encoded video file.

    Raises:
        gr.Error: If the image or the audio input is missing.
        subprocess.CalledProcessError: If the inference script exits with a
            non-zero status.
    """
    # Fail early with a readable message; otherwise a missing input would be
    # formatted into the command line as the literal string "None".
    if not image_path:
        raise gr.Error("Please upload an image before running.")
    if not audio_input:
        raise gr.Error("Please provide an audio input before running.")

    output_name = "lipsynced_result"

    command = [
        "python",
        "inference_for_demo_video.py",
        f"--wav_path={audio_input}",
        f"--style_clip_path=data/style_clip/3DMM/{emotional_style}",
        "--pose_path=data/pose/RichardShelby_front_neutral_level1_001.mat",
        f"--image_path={image_path}",
        "--cfg_scale=1.0",
        "--max_gen_len=30",
        f"--output_name={output_name}",
    ]

    execute_command(command)

    # Convert video to compatible codecs
    input_file = f"output_video/{output_name}.mp4"
    output_file = f"{output_name}.mp4"

    return convert_to_mp4_with_aac(input_file, output_file)

# Page-level CSS: centre the main column and cap its width.
css = """
#col-container{
    margin: 0 auto;
    max-width: 940px;
}
"""

# Audio clips offered in the dropdown; load_audio maps them to data/audio/.
_AUDIO_CHOICES = [
    "German1.wav", "German2.wav", "German3.wav", "German4.wav",
    "acknowledgement_chinese.m4a", "acknowledgement_english.m4a",
    "chinese1_haierlizhi.wav", "chinese2_guanyu.wav",
    "french1.wav", "french2.wav", "french3.wav",
    "italian1.wav", "italian2.wav", "italian3.wav",
    "japan1.wav", "japan2.wav", "japan3.wav",
    "korean1.wav", "korean2.wav", "korean3.wav",
    "noisy_audio_cafeter_snr_0.wav",
    "noisy_audio_meeting_snr_0.wav",
    "noisy_audio_meeting_snr_10.wav",
    "noisy_audio_meeting_snr_20.wav",
    "noisy_audio_narrative.wav",
    "noisy_audio_office_snr_0.wav",
    "out_of_domain_narrative.wav",
    "spanish1.wav", "spanish2.wav", "spanish3.wav",
]

# Style clips offered in the dropdown: every speaker/emotion combination.
# The neutral clips are named "level1"; all other emotions are "level3".
_STYLE_SPEAKERS = ("M030", "W009", "W011")
_STYLE_EMOTIONS = (
    ("angry", 3),
    ("contempt", 3),
    ("disgusted", 3),
    ("fear", 3),
    ("happy", 3),
    ("neutral", 1),
    ("sad", 3),
    ("surprised", 3),
)
_STYLE_CHOICES = [
    f"{speaker}_front_{emotion}_level{level}_001.mat"
    for speaker in _STYLE_SPEAKERS
    for emotion, level in _STYLE_EMOTIONS
]

# Portrait images offered as clickable examples for the image input.
_EXAMPLE_IMAGES = [
    "data/src_img/uncropped/face3.png",
    "data/src_img/uncropped/male_face.png",
    "data/src_img/uncropped/uncut_src_img.jpg",
    "data/src_img/cropped/chpa5.png",
    "data/src_img/cropped/cut_img.png",
    "data/src_img/cropped/f30.png",
    "data/src_img/cropped/menglu2.png",
    "data/src_img/cropped/nscu2.png",
    "data/src_img/cropped/zp1.png",
    "data/src_img/cropped/zt12.png",
]

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Title and short description of the project.
        gr.HTML("""
        <h1 style="text-align: center;">DreamTalk</h1>
        <h2 style="text-align: center;">When Expressive Talking Head Generation Meets Diffusion Probabilistic Models</h2>
        <p style="text-align: center;">
            DreamTalk is a diffusion-based audio-driven expressive talking head generation framework that can produce high-quality talking head videos across diverse speaking styles. <br />
            DreamTalk exhibits robust performance with a diverse array of inputs, including songs, speech in multiple languages, noisy audio, and out-of-domain portraits.
        </p>
        """)
        with gr.Row():
            # Left column: inputs and the run button.
            with gr.Column():
                image_path = gr.Image(
                    label="Image",
                    type="filepath",
                    sources=["upload"],
                )
                audio_input = gr.Audio(
                    label="Audio input",
                    type="filepath",
                    sources=["upload"],
                    value="data/audio/acknowledgement_english.m4a",
                )
                with gr.Row():
                    audio_list = gr.Dropdown(
                        label="Choose an audio (optional)",
                        choices=_AUDIO_CHOICES,
                        value="acknowledgement_english.m4a",
                    )
                    # Picking a bundled clip overwrites the audio input with its path.
                    audio_list.change(
                        fn=load_audio,
                        inputs=[audio_list],
                        outputs=[audio_input],
                    )
                    emotional_style = gr.Dropdown(
                        label="emotional style",
                        choices=_STYLE_CHOICES,
                        value="M030_front_neutral_level1_001.mat",
                    )
                gr.Examples(
                    examples=_EXAMPLE_IMAGES,
                    inputs=[image_path],
                    examples_per_page=5,
                )
                run_btn = gr.Button("Run")
            # Right column: generated video and the project teaser image.
            with gr.Column():
                output_video = gr.Video(format="mp4")
                gr.HTML("""
                <img src="https://github.com/ali-vilab/dreamtalk/raw/main/media/teaser.gif" style="margin: 0 auto;border-radius: 10px;" />
                """)

    # The run button feeds the three inputs to infer and shows the returned video.
    run_btn.click(
        fn=infer,
        inputs=[audio_input, image_path, emotional_style],
        outputs=[output_video],
    )

# Enable the request queue, then start serving the app.
demo.queue()
demo.launch()