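# Gradio demo for DreamTalk: expressive talking-head generation.
# The app shells out to the repo's inference_for_demo_video.py and
# re-encodes the result so it plays back in the browser.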
import gradio as gr
import subprocess
from moviepy.editor import VideoFileClip

def convert_to_mp4_with_aac(input_path, output_path):
    # Load the generated video
    video = VideoFileClip(input_path)

    # Re-encode as MP4 (H.264 video, AAC audio) so browsers can play it
    video.write_videofile(output_path, codec="libx264", audio_codec="aac")
    video.close()  # release the file readers moviepy keeps open

    return output_path

def load_audio(audio_listed):
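    # Map a dropdown filename to its path in the bundled data/audio folder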
    if audio_listed is None:
        return None
    return f"data/audio/{audio_listed}"

def execute_command(command: list) -> None:
    # check=True raises CalledProcessError if the inference script fails
    subprocess.run(command, check=True)

def infer(audio_input, image_path, emotional_style):
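    """Generate a lip-synced talking-head video with DreamTalk and
    return the path to a browser-playable MP4."""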

    output_name = "lipsynced_result"

    command = [
        "python",
        "inference_for_demo_video.py",
        f"--wav_path={audio_input}",
        f"--style_clip_path=data/style_clip/3DMM/{emotional_style}",
        "--pose_path=data/pose/RichardShelby_front_neutral_level1_001.mat",
        f"--image_path={image_path}",
        "--cfg_scale=1.0",
        "--max_gen_len=30",
        f"--output_name={output_name}"
    ]

    execute_command(command)

    # Convert video to compatible codecs
    input_file = f"output_video/{output_name}.mp4"
    output_file = f"{output_name}.mp4"
    
    result = convert_to_mp4_with_aac(input_file, output_file)
    
    return result

css="""
#col-container{
    margin: 0 auto;
    max-width: 940px;
}
#project-links{
    margin: 0 0 12px !important;
    column-gap: 8px;
    display: flex;
    justify-content: center;
    flex-wrap: nowrap;
    flex-direction: row;
    align-items: center;
}
#run-btn{
    border: var(--button-border-width) solid var(--button-primary-border-color);
    background: var(--button-primary-background-fill);
    color: var(--button-primary-text-color);
}
#run-btn:hover{
    border-color: var(--button-primary-border-color-hover);
    background: var(--button-primary-background-fill-hover);
    color: var(--button-primary-text-color-hover);
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">DreamTalk</h2>
        <p style="text-align: center;">When Expressive Talking Head Generation Meets Diffusion Probabilistic Models</p>
        <p style="margin:12px auto;display: flex;justify-content: center;">
            <a href="https://huggingface.co/spaces/fffiloni/dreamtalk?duplicate=true"><img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg.svg" alt="Duplicate this Space"></a>
        </p>
        
        """)
        with gr.Row():
            with gr.Column():
                image_path = gr.Image(label="Image", type="filepath", sources=["upload"])
                audio_input = gr.Audio(label="Audio input", type="filepath", sources=["upload"], value="data/audio/acknowledgement_english.m4a")
                with gr.Row():
                    audio_list = gr.Dropdown(
                        label="Choose an audio (optional)",
                        choices=[
                            "German1.wav", "German2.wav", "German3.wav", "German4.wav",
                            "acknowledgement_chinese.m4a", "acknowledgement_english.m4a",
                            "chinese1_haierlizhi.wav", "chinese2_guanyu.wav",
                            "french1.wav", "french2.wav", "french3.wav",
                            "italian1.wav", "italian2.wav", "italian3.wav",
                            "japan1.wav", "japan2.wav", "japan3.wav",
                            "korean1.wav", "korean2.wav", "korean3.wav",
                            "noisy_audio_cafeter_snr_0.wav", "noisy_audio_meeting_snr_0.wav", "noisy_audio_meeting_snr_10.wav", "noisy_audio_meeting_snr_20.wav", "noisy_audio_narrative.wav", "noisy_audio_office_snr_0.wav", "out_of_domain_narrative.wav",
                            "spanish1.wav", "spanish2.wav", "spanish3.wav"
                        ],
                        value = "acknowledgement_english.m4a"
                    )
                    audio_list.change(
                        fn=load_audio,
                        inputs=[audio_list],
                        outputs=[audio_input]
                    )
                    emotional_style = gr.Dropdown(
                        label="Emotional style",
                        choices=[
                            "M030_front_angry_level3_001.mat",
                            "M030_front_contempt_level3_001.mat",
                            "M030_front_disgusted_level3_001.mat",
                            "M030_front_fear_level3_001.mat",
                            "M030_front_happy_level3_001.mat",
                            "M030_front_neutral_level1_001.mat",
                            "M030_front_sad_level3_001.mat",
                            "M030_front_surprised_level3_001.mat",
                            "W009_front_angry_level3_001.mat",
                            "W009_front_contempt_level3_001.mat",
                            "W009_front_disgusted_level3_001.mat",
                            "W009_front_fear_level3_001.mat",
                            "W009_front_happy_level3_001.mat",
                            "W009_front_neutral_level1_001.mat",
                            "W009_front_sad_level3_001.mat",
                            "W009_front_surprised_level3_001.mat",
                            "W011_front_angry_level3_001.mat",
                            "W011_front_contempt_level3_001.mat",
                            "W011_front_disgusted_level3_001.mat",
                            "W011_front_fear_level3_001.mat",
                            "W011_front_happy_level3_001.mat",
                            "W011_front_neutral_level1_001.mat",
                            "W011_front_sad_level3_001.mat",
                            "W011_front_surprised_level3_001.mat"
                        ],
                        value = "M030_front_neutral_level1_001.mat"
                    )
                gr.Examples(
                    examples=[
                        "data/src_img/uncropped/face3.png",
                        "data/src_img/uncropped/male_face.png",
                        "data/src_img/uncropped/uncut_src_img.jpg",
                        "data/src_img/cropped/chpa5.png",
                        "data/src_img/cropped/cut_img.png",
                        "data/src_img/cropped/f30.png",
                        "data/src_img/cropped/menglu2.png",
                        "data/src_img/cropped/nscu2.png",
                        "data/src_img/cropped/zp1.png",
                        "data/src_img/cropped/zt12.png"
                    ],
                    inputs=[image_path],
                    examples_per_page=5
                )
                with gr.Row():
                    gr.ClearButton([audio_input, image_path, audio_list])
                    run_btn = gr.Button("Run", elem_id="run-btn")
            with gr.Column():
                output_video = gr.Video(format="mp4")
                gr.HTML("""
                <p id="project-links" align="center">
                  <a href='https://dreamtalk-project.github.io/'><img src='https://img.shields.io/badge/Project-Page-Green'></a> <a href='https://arxiv.org/abs/2312.09767'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a> <a href='https://youtu.be/VF4vlE6ZqWQ'><img src='https://badges.aleen42.com/src/youtube.svg'></a>
                </p>
                <img src="https://github.com/ali-vilab/dreamtalk/raw/main/media/teaser.gif" style="margin: 0 auto;border-radius: 10px;" />    
                """)
    
    run_btn.click(
        fn=infer,
        inputs=[audio_input, image_path, emotional_style],
        outputs=[output_video]
    )

demo.queue().launch()