Update app.py
app.py (CHANGED)
@@ -2,16 +2,34 @@ import os
 import shutil
 from huggingface_hub import snapshot_download
 import gradio as gr
-
-from
+import numpy as np
+from PIL import Image
+import soundfile as sf
 import argparse
 import uuid
 
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+from scripts.inference import inference_process
+
 is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] else False
 
 if not is_shared_ui:
     hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
 
+def check_image_square(image_path):
+    image = Image.open(image_path)
+    if image.width != image.height:
+        raise gr.Error("The uploaded image is not square. Please upload a square image.")
+    return image_path
+
+def convert_audio_to_wav(audio_path):
+    if not audio_path.endswith('.wav'):
+        audio_data, samplerate = sf.read(audio_path)
+        wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
+        sf.write(wav_path, audio_data, samplerate)
+        return wav_path
+    return audio_path
+
 def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
     if is_shared_ui:
         raise gr.Error("This Space only works in duplicated instances")
@@ -33,23 +51,26 @@ def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_wei
     inference_process(args)
     return f'output-{unique_id}.mp4'
 
-with gr.Blocks(theme='freddyaboulton/dracula_revamped@0.3.8' ) as demo:
+with gr.Blocks(theme='freddyaboulton/dracula_revamped@0.3.8') as demo:
     gr.Markdown(
         """
-        # Talking Head Generation
+        # Talking Head Generation :🗣️📢
         Upload a face image and driving audio, and adjust the weights to generate a talking head video.
+
+        > **Note:**
+        > - The face should be the main focus, making up 50%-70% of the image.
+        > - The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
+        > - To make it work, duplicate the Space and run it on your own profile using a private GPU.
+        > - An L4 costs US$0.80/h.
         """
     )
 
     with gr.Row():
         with gr.Column():
-            avatar_face = gr.Image(type="filepath", label="Face", elem_id="face-input")
-            driving_audio = gr.Audio(type="filepath", label="Driving Audio", elem_id="audio-input")
-
-
+            avatar_face = gr.Image(type="filepath", label="Face", elem_id="face-input").change(check_image_square, avatar_face)
+            driving_audio = gr.Audio(type="filepath", label="Driving Audio", elem_id="audio-input").change(convert_audio_to_wav, driving_audio)
 
         with gr.Column():
-            output_video = gr.Video(label="Your Talking Head", elem_id="output-video")
             with gr.Accordion("Advanced Settings", open=False):
                 pose_weight = gr.Slider(minimum=0.0, value=1.5, label="Pose Weight")
                 face_weight = gr.Slider(minimum=0.0, value=1.0, label="Face Weight")
@@ -57,6 +78,7 @@ with gr.Blocks(theme='freddyaboulton/dracula_revamped@0.3.8' ) as demo:
                 face_expand_ratio = gr.Slider(minimum=0.0, value=1.2, label="Face Expand Ratio")
 
             generate = gr.Button("Generate", elem_id="generate-button")
+            output_video = gr.Video(label="Your Talking Head", elem_id="output-video")
 
     generate.click(
         fn=run_inference,