import argparse
import os
import uuid

import gradio as gr
import soundfile as sf
from huggingface_hub import snapshot_download
from PIL import Image

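# Run from the app's own directory so relative paths (configs/, scripts/, pretrained_models/)
# resolve regardless of where the process was started, then import the local inference entry point.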
os.chdir(os.path.dirname(os.path.abspath(__file__)))
from scripts.inference import inference_process

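# The shared demo Space (detected via the SPACE_ID env var set by Hugging Face) does not run
# inference itself; users are expected to duplicate it and run on their own GPU.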
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")

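# Download the pretrained Hallo weights into pretrained_models/ unless this is the shared demo Space.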
if not is_shared_ui:
    hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")

def check_image_square(image_path):
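    """Reject non-square face images with a Gradio error; return the path unchanged otherwise."""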
    image = Image.open(image_path)
    if image.width != image.height:
        raise gr.Error("The uploaded image is not square. Please upload a square image.")
    return image_path

def convert_audio_to_wav(audio_path):
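    """Re-encode non-WAV driving audio to a WAV file next to the original and return its path."""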
    if not audio_path.endswith('.wav'):
        audio_data, samplerate = sf.read(audio_path)
        wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
        sf.write(wav_path, audio_data, samplerate)
        return wav_path
    return audio_path

def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
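    """Run the Hallo inference pipeline on the inputs and return the path of the generated video."""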
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")
        
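    # Use a unique output filename per request so concurrent runs do not overwrite each other.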
    unique_id = uuid.uuid4()
    
    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=f'output-{unique_id}.mp4',
        pose_weight=pose_weight,
        face_weight=face_weight,
        lip_weight=lip_weight,
        face_expand_ratio=face_expand_ratio,
        checkpoint=None
    )
    
    inference_process(args)
    return f'output-{unique_id}.mp4' 

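# Build the Gradio UI: image/audio inputs, weight sliders, and the generated video output.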
with gr.Blocks(theme='freddyaboulton/dracula_revamped@0.3.8') as demo:
    gr.Markdown(
        """
        # Talking Head Generation
        Upload a face image and driving audio, and adjust the weights to generate a talking head video.
        
        > **Note:**
        > - The face should be the main focus, making up 50%-70% of the image.
        > - The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
        > - To make it work, duplicate the Space and run it on your own profile using a private GPU.
        > - An L4 costs US$0.80/h.
        """
    )
    
    with gr.Row():
        with gr.Column():
            avatar_face = gr.Image(type="filepath", label="Face", elem_id="face-input")
            driving_audio = gr.Audio(type="filepath", label="Driving Audio", elem_id="audio-input")
            
        with gr.Column():
            with gr.Accordion("Advanced Settings", open=False):
                pose_weight = gr.Slider(minimum=0.0, value=1.5, label="Pose Weight")
                face_weight = gr.Slider(minimum=0.0, value=1.0, label="Face Weight")
                lip_weight = gr.Slider(minimum=0.0, value=1.1, label="Lip Weight")
                face_expand_ratio = gr.Slider(minimum=0.0, value=1.2, label="Face Expand Ratio")
            
            generate = gr.Button("Generate", elem_id="generate-button")
            output_video = gr.Video(label="Your Talking Head", elem_id="output-video")

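    # Validate the face image and convert the audio as soon as they are uploaded.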
    avatar_face.change(fn=check_image_square, inputs=avatar_face, outputs=avatar_face)
    driving_audio.change(fn=convert_audio_to_wav, inputs=driving_audio, outputs=driving_audio)

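    # Run inference when the Generate button is clicked.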
    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
        outputs=output_video
    )
    
demo.launch()