leeway.zlw committed on
Commit
69c71b8
1 Parent(s): ca33a23
Files changed (1) hide show
  1. app.py +27 -6
app.py CHANGED
@@ -12,7 +12,7 @@ is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] els
12
  if(not is_shared_ui):
13
  hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
14
 
15
- def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
16
  if is_shared_ui:
17
  raise gr.Error("This Space only works in duplicated instances")
18
 
@@ -23,10 +23,10 @@ def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=T
23
  source_image=source_image,
24
  driving_audio=driving_audio,
25
  output=f'output-{unique_id}.mp4',
26
- pose_weight=1.0,
27
- face_weight=1.0,
28
- lip_weight=1.0,
29
- face_expand_ratio=1.2,
30
  checkpoint=None
31
  )
32
 
@@ -91,17 +91,38 @@ with gr.Blocks(css=css) as demo:
91
  ''', elem_id="warning-duplicate")
92
  gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
93
  gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  with gr.Row():
95
  with gr.Column():
96
  avatar_face = gr.Image(type="filepath", label="Face")
97
  driving_audio = gr.Audio(type="filepath", label="Driving audio")
 
 
 
 
98
  generate = gr.Button("Generate")
99
  with gr.Column():
100
  output_video = gr.Video(label="Your talking head")
101
 
102
  generate.click(
103
  fn=run_inference,
104
- inputs=[avatar_face, driving_audio],
105
  outputs=output_video
106
  )
107
 
 
12
  if(not is_shared_ui):
13
  hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
14
 
15
+ def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
16
  if is_shared_ui:
17
  raise gr.Error("This Space only works in duplicated instances")
18
 
 
23
  source_image=source_image,
24
  driving_audio=driving_audio,
25
  output=f'output-{unique_id}.mp4',
26
+ pose_weight=pose_weight,
27
+ face_weight=face_weight,
28
+ lip_weight=lip_weight,
29
+ face_expand_ratio=face_expand_ratio,
30
  checkpoint=None
31
  )
32
 
 
91
  ''', elem_id="warning-duplicate")
92
  gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
93
  gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
94
+ gr.Markdown("""
95
+ Hallo has a few simple requirements for input data:
96
+
97
+ For the source image:
98
+
99
+ 1. It should be cropped into squares.
100
+ 2. The face should be the main focus, making up 50%-70% of the image.
101
+ 3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
102
+
103
+ For the driving audio:
104
+
105
+ 1. It must be in WAV format.
106
+ 2. It must be in English since our training datasets are only in this language.
107
+ 3. Ensure the vocals are clear; background music is acceptable.
108
+
109
+ We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
110
+ """)
111
  with gr.Row():
112
  with gr.Column():
113
  avatar_face = gr.Image(type="filepath", label="Face")
114
  driving_audio = gr.Audio(type="filepath", label="Driving audio")
115
+ pose_weight = gr.Number(label="pose weight", value=1.0)
116
+ face_weight = gr.Number(label="face weight", value=1.0)
117
+ lip_weight = gr.Number(label="lip weight", value=1.0)
118
+ face_expand_ratio = gr.Number(label="face expand ratio", value=1.2)
119
  generate = gr.Button("Generate")
120
  with gr.Column():
121
  output_video = gr.Video(label="Your talking head")
122
 
123
  generate.click(
124
  fn=run_inference,
125
+ inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
126
  outputs=output_video
127
  )
128