hallo

Sleeping

App Files Community

hallo / app.py

saicharan1234

Update app.py

614db49 verified 4 months ago

raw

history blame

3.58 kB

	import os
	import shutil
	from huggingface_hub import snapshot_download
	import gradio as gr
	import numpy as np
	from PIL import Image
	import soundfile as sf
	import argparse
	import uuid

	# Work from this file's directory: the relative paths used by this app
	# ('configs/...', 'pretrained_models', 'output-*.mp4') are resolved against
	# the current working directory.
	os.chdir(os.path.dirname(os.path.abspath(__file__)))
	from scripts.inference import inference_process

	# True when running as the original shared Space rather than a duplicate.
	# SPACE_ID may be unset (e.g. a local run), so read it with a default
	# instead of indexing os.environ, which would raise KeyError at import time.
	is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")

	# The shared Space refuses to run inference, so it does not need the weights.
	if not is_shared_ui:
	    hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")

	def check_image_square(image_path):
	    """Validate that the uploaded face image is square.

	    Args:
	        image_path: Filesystem path of the uploaded image, or None when the
	            image input currently holds no value.

	    Returns:
	        ``image_path`` unchanged (None is passed through as None).

	    Raises:
	        gr.Error: If the image's width and height differ.
	    """
	    # Nothing to validate when the input has been cleared.
	    if image_path is None:
	        return None

	    # Context manager so the underlying file handle is released promptly.
	    with Image.open(image_path) as image:
	        is_square = image.width == image.height

	    if not is_square:
	        raise gr.Error("The uploaded image is not square. Please upload a square image.")
	    return image_path

	def convert_audio_to_wav(audio_path):
	    """Return the path of a WAV version of the given audio file.

	    Files that already carry a ``.wav`` extension (compared
	    case-insensitively) are returned untouched. Anything else is read with
	    ``soundfile`` and written next to the original with the extension
	    replaced by ``.wav``.

	    Args:
	        audio_path: Filesystem path of the uploaded audio, or None when the
	            audio input currently holds no value.

	    Returns:
	        The path of the WAV file, or None if ``audio_path`` is None.
	    """
	    # Nothing to convert when the input has been cleared.
	    if audio_path is None:
	        return None

	    # splitext only looks at the final path component, so a dot in a
	    # directory name is never mistaken for the extension separator.
	    base, ext = os.path.splitext(audio_path)
	    if ext.lower() == '.wav':
	        return audio_path

	    audio_data, samplerate = sf.read(audio_path)
	    wav_path = base + '.wav'
	    sf.write(wav_path, audio_data, samplerate)
	    return wav_path

	def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
	    """Generate a talking-head video from a face image and driving audio.

	    Args:
	        source_image: Path of the face image.
	        driving_audio: Path of the driving audio file.
	        pose_weight: Forwarded to the inference script as ``pose_weight``.
	        face_weight: Forwarded to the inference script as ``face_weight``.
	        lip_weight: Forwarded to the inference script as ``lip_weight``.
	        face_expand_ratio: Forwarded to the inference script as
	            ``face_expand_ratio``.
	        progress: Not used in the body; declared so Gradio can surface tqdm
	            progress from the inference run in the UI.

	    Returns:
	        Path of the generated ``.mp4`` file, relative to the working directory.

	    Raises:
	        gr.Error: If running on the shared Space, or if the image or audio
	            input is missing.
	    """
	    if is_shared_ui:
	        raise gr.Error("This Space only works in duplicated instances")

	    # Fail with a readable message instead of passing None into the
	    # inference script.
	    if not source_image or not driving_audio:
	        raise gr.Error("Please provide both a face image and a driving audio file.")

	    # Unique name per request so separate runs do not overwrite each other.
	    output_path = f'output-{uuid.uuid4()}.mp4'

	    args = argparse.Namespace(
	        config='configs/inference/default.yaml',
	        source_image=source_image,
	        driving_audio=driving_audio,
	        output=output_path,
	        pose_weight=pose_weight,
	        face_weight=face_weight,
	        lip_weight=lip_weight,
	        face_expand_ratio=face_expand_ratio,
	        checkpoint=None,
	    )

	    inference_process(args)
	    return output_path

	# Introductory text rendered at the top of the page.
	INTRO_MARKDOWN = """
	# Talking Head Generation
	Upload a face image and driving audio, and adjust the weights to generate a talking head video.

	> Note:
	> - The face should be the main focus, making up 50%-70% of the image.
	> - The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
	> - To make it work, duplicate the Space and run it on your own profile using a private GPU.
	> - An L4 costs US$0.80/h.
	"""

	with gr.Blocks(theme='freddyaboulton/dracula_revamped@0.3.8') as demo:
	    gr.Markdown(INTRO_MARKDOWN)

	    with gr.Row():
	        # First column: the two user-supplied inputs.
	        with gr.Column():
	            avatar_face = gr.Image(type="filepath", label="Face", elem_id="face-input")
	            driving_audio = gr.Audio(type="filepath", label="Driving Audio", elem_id="audio-input")

	        # Second column: tuning sliders, the trigger button and the result.
	        # NOTE(review): the original nesting was lost; placing the button and
	        # video inside this column is an assumption -- confirm the layout.
	        with gr.Column():
	            with gr.Accordion("Advanced Settings", open=False):
	                pose_weight = gr.Slider(minimum=0.0, value=1.5, label="Pose Weight")
	                face_weight = gr.Slider(minimum=0.0, value=1.0, label="Face Weight")
	                lip_weight = gr.Slider(minimum=0.0, value=1.1, label="Lip Weight")
	                face_expand_ratio = gr.Slider(minimum=0.0, value=1.2, label="Face Expand Ratio")

	            generate = gr.Button("Generate", elem_id="generate-button")
	            output_video = gr.Video(label="Your Talking Head", elem_id="output-video")

	    # Check / convert each upload as soon as its value changes; the handler's
	    # return value is written back into the same component.
	    avatar_face.change(fn=check_image_square, inputs=avatar_face, outputs=avatar_face)
	    driving_audio.change(fn=convert_audio_to_wav, inputs=driving_audio, outputs=driving_audio)

	    # Order matches run_inference's positional parameters.
	    inference_inputs = [
	        avatar_face,
	        driving_audio,
	        pose_weight,
	        face_weight,
	        lip_weight,
	        face_expand_ratio,
	    ]
	    generate.click(fn=run_inference, inputs=inference_inputs, outputs=output_video)

	demo.launch()