Spaces:
Sleeping
Sleeping
import os | |
import shutil | |
from huggingface_hub import snapshot_download | |
import gradio as gr | |
import numpy as np | |
from PIL import Image | |
import soundfile as sf | |
import argparse | |
import uuid | |
os.chdir(os.path.dirname(os.path.abspath(__file__))) | |
from scripts.inference import inference_process | |
# Detect whether this process runs inside the original shared Space.
# SPACE_ID is read with .get() so the module still imports when the variable
# is absent (e.g. a local run); an absent value is treated as "not shared".
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")

if not is_shared_ui:
    # Private instance: download the model repository into ./pretrained_models.
    hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
def check_image_square(image_path):
    """Validate that the image at ``image_path`` is square.

    Args:
        image_path: Filesystem path of the image to check, or ``None`` when
            no image is currently set.

    Returns:
        ``image_path`` unchanged (``None`` is passed through as-is).

    Raises:
        gr.Error: If the image's width and height differ.
    """
    if image_path is None:
        # Nothing to validate; avoid handing None to Image.open.
        return image_path

    # The context manager closes the underlying file once the size is read.
    with Image.open(image_path) as image:
        is_square = image.width == image.height

    if not is_square:
        raise gr.Error("The uploaded image is not square. Please upload a square image.")
    return image_path
def convert_audio_to_wav(audio_path):
    """Return a path to a WAV version of ``audio_path``, converting if needed.

    Args:
        audio_path: Filesystem path of the audio file, or ``None`` when no
            audio is currently set.

    Returns:
        ``audio_path`` itself when it is ``None`` or already has a ``.wav``
        extension (compared case-insensitively); otherwise the path of a
        newly written file with the same base name and a ``.wav`` extension.
    """
    if audio_path is None:
        return None

    # splitext only looks at the final path component, so a dot inside a
    # directory name can no longer truncate the output path.
    base, extension = os.path.splitext(audio_path)
    if extension.lower() == '.wav':
        return audio_path

    audio_data, samplerate = sf.read(audio_path)
    wav_path = base + '.wav'
    sf.write(wav_path, audio_data, samplerate)
    return wav_path
def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
    """Run the inference pipeline and return the path of the generated video.

    Args:
        source_image: Path of the face image, forwarded to ``inference_process``.
        driving_audio: Path of the driving audio, forwarded to ``inference_process``.
        pose_weight: Forwarded unchanged as ``args.pose_weight``.
        face_weight: Forwarded unchanged as ``args.face_weight``.
        lip_weight: Forwarded unchanged as ``args.lip_weight``.
        face_expand_ratio: Forwarded unchanged as ``args.face_expand_ratio``.
        progress: Gradio progress tracker; not referenced in the body, it is
            declared so Gradio can inject it.

    Returns:
        The output file name ``output-<uuid4>.mp4`` (relative to the current
        working directory) that was passed to ``inference_process``.

    Raises:
        gr.Error: If running in the shared Space, or if the image or audio
            input is missing.
    """
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")

    # Fail with a readable message instead of passing None into the pipeline.
    if not source_image or not driving_audio:
        raise gr.Error("Please provide both a face image and a driving audio file.")

    # Build the unique output name once so the value handed to the pipeline
    # and the value returned to the UI cannot diverge.
    output_path = f'output-{uuid.uuid4()}.mp4'

    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=output_path,
        pose_weight=pose_weight,
        face_weight=face_weight,
        lip_weight=lip_weight,
        face_expand_ratio=face_expand_ratio,
        checkpoint=None,
    )
    inference_process(args)
    return output_path
# Build the Gradio interface: inputs, advanced sliders, the trigger button and
# the video output, then start the server.
with gr.Blocks(theme='freddyaboulton/dracula_revamped@0.3.8') as demo:
    gr.Markdown(
        """
        # Talking Head Generation :🗣️📢
        Upload a face image and driving audio, and adjust the weights to generate a talking head video.
        > **Note:**
        > - The face should be the main focus, making up 50%-70% of the image.
        > - The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
        > - To make it work, duplicate the Space and run it on your own profile using a private GPU.
        > - An L4 costs US$0.80/h.
        """
    )
    with gr.Row():
        with gr.Column():
            # Create each component first and register its listener in a
            # separate statement. Chaining `.change(...)` onto the constructor
            # referenced the variable before it was assigned (NameError) and
            # would have bound the name to the listener's return value rather
            # than to the component used by `generate.click` below.
            avatar_face = gr.Image(type="filepath", label="Face", elem_id="face-input")
            avatar_face.change(check_image_square, avatar_face)

            driving_audio = gr.Audio(type="filepath", label="Driving Audio", elem_id="audio-input")
            # NOTE(review): no outputs are wired here, so the converted path
            # returned by the handler is not fed back into the component.
            driving_audio.change(convert_audio_to_wav, driving_audio)
        with gr.Column():
            with gr.Accordion("Advanced Settings", open=False):
                pose_weight = gr.Slider(minimum=0.0, value=1.5, label="Pose Weight")
                face_weight = gr.Slider(minimum=0.0, value=1.0, label="Face Weight")
                lip_weight = gr.Slider(minimum=0.0, value=1.1, label="Lip Weight")
                face_expand_ratio = gr.Slider(minimum=0.0, value=1.2, label="Face Expand Ratio")
            generate = gr.Button("Generate", elem_id="generate-button")
            output_video = gr.Video(label="Your Talking Head", elem_id="output-video")

    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
        outputs=output_video,
    )

demo.launch()