"""Gradio front-end for Hallo (audio-driven portrait image animation).

The script downloads the pretrained weights (unless it is running as the
shared, demo-only Space), exposes a single `run_inference` handler and
launches a Blocks UI around it.
"""
# `spaces` is kept as the very first import, exactly as in the original file.
# NOTE(review): ZeroGPU presumably needs it imported before any CUDA-using
# package -- confirm before reordering.
import spaces

import argparse
import os
import shutil
import uuid

import gradio as gr
from huggingface_hub import snapshot_download

# Relative paths used below (configs/, pretrained_models/, output-*.mp4) are
# resolved against the directory containing this file.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

from scripts.inference import inference_process

# SPACE_ID is only present when running on Hugging Face Spaces; default to an
# empty string so the script also starts locally instead of raising KeyError.
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")

if not is_shared_ui:
    # Inference is disabled on the shared Space, so the weights are only
    # fetched for duplicated / local instances.
    hallo_dir = snapshot_download(
        repo_id="fudan-generative-ai/hallo",
        local_dir="pretrained_models",
    )


@spaces.GPU(duration=230)
def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    """Animate ``source_image`` with ``driving_audio`` and return the video path.

    Args:
        source_image: Filesystem path of the uploaded portrait image.
        driving_audio: Filesystem path of the uploaded audio clip.
        progress: Gradio progress tracker; ``track_tqdm=True`` mirrors tqdm
            progress bars into the UI.

    Returns:
        Path (relative to the working directory) of the generated MP4 file.

    Raises:
        gr.Error: If this is the shared Space, or if either input is missing.
    """
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")
    if not source_image or not driving_audio:
        # Fail with a readable message instead of passing None paths on to
        # the inference pipeline.
        raise gr.Error("Please provide both a source image and a driving audio file.")

    # A random suffix keeps concurrent requests from overwriting each other.
    output_path = f'output-{uuid.uuid4()}.mp4'

    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=output_path,
        pose_weight=1.0,
        face_weight=1.0,
        lip_weight=1.0,
        face_expand_ratio=1.2,
        checkpoint=None,
    )
    inference_process(args)
    return output_path


# Stylesheet for the "ready" / "duplicate this Space" banners. Adjacent string
# literals are concatenated into a single stylesheet string.
css = (
    " div#warning-ready { background-color: #ecfdf5; padding: 0 16px 16px;"
    " margin: 20px 0; color: #030303!important; }"
    " div#warning-ready > .gr-prose > h2,"
    " div#warning-ready > .gr-prose > p { color: #057857!important; }"
    " div#warning-duplicate { background-color: #ebf5ff; padding: 0 16px 16px;"
    " margin: 20px 0; color: #030303!important; }"
    " div#warning-duplicate > .gr-prose > h2,"
    " div#warning-duplicate > .gr-prose > p { color: #0f4592!important; }"
    " div#warning-duplicate strong { color: #0f4592; }"
    " p.actions { display: flex; align-items: center; margin: 20px 0; }"
    " div#warning-duplicate .actions a { display: inline-block; margin-right: 10px; }"
    " .dark #warning-duplicate { background-color: #0c0c0c !important;"
    " border: 1px solid white !important; }"
    " "
)

# Input guidelines shown above the form. Markdown needs the blank lines and
# one-item-per-line layout for the numbered lists to render as lists.
INPUT_REQUIREMENTS_MD = """
Hallo has a few simple requirements for input data:

For the source image:

1. It should be cropped into squares.
2. The face should be the main focus, making up 50%-70% of the image.
3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).

For the driving audio:

1. It must be in WAV format.
2. It must be in English since our training datasets are only in this language.
3. Ensure the vocals are clear; background music is acceptable.

We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
"""

with gr.Blocks(css=css) as demo:
    if is_shared_ui:
        # NOTE(review): the banner markup is empty here (just a newline), yet
        # the stylesheet targets `.gr-prose` headings and `.actions` links
        # inside #warning-duplicate -- confirm the intended HTML.
        top_description = gr.HTML("\n", elem_id="warning-duplicate")

    gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
    gr.Markdown(
        "Generate talking head avatars driven from audio. "
        "**5 seconds of audio takes >10 minutes to generate on an L4** - "
        "duplicate the space for private use or try for free on Google Colab"
    )
    gr.Markdown(INPUT_REQUIREMENTS_MD)

    with gr.Row():
        with gr.Column():
            # type="filepath" hands run_inference paths rather than arrays.
            avatar_face = gr.Image(type="filepath", label="Face")
            driving_audio = gr.Audio(type="filepath", label="Driving audio")
            generate = gr.Button("Generate")
        with gr.Column():
            output_video = gr.Video(label="Your talking head")

    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio],
        outputs=output_video,
    )

demo.launch()