Spaces:

EvanTHU
/

MotionCLR

Running on Zero

App Files Files Community

MotionCLR / app.py

EvanTHU

init demo

b887ad8 verified 4 months ago

raw

history blame

20.8 kB

	import spaces
	import gradio as gr
	import sys
	import os
	import torch
	import numpy as np
	from os.path import join as pjoin
	import utils.paramUtil as paramUtil
	from utils.plot_script import *
	from utils.utils import *
	from utils.motion_process import recover_from_ric
	from accelerate.utils import set_seed
	from models.gaussian_diffusion import DiffusePipeline
	from options.generate_options import GenerateOptions
	from utils.model_load import load_model_weights
	from motion_loader import get_dataset_loader
	from models import build_models
	import yaml
	import time
	from box import Box
	import hashlib
	from huggingface_hub import hf_hub_download

	# Local directory that receives the released text-to-motion assets.
	ckptdir = './checkpoints/t2m/release'
	os.makedirs(ckptdir, exist_ok=True)

	# Download, in this order, the normalisation statistics (mean/std), the
	# checkpoint archive and the option file from the "EvanTHU/MotionCLR" Hub
	# repository into `ckptdir`; each name is bound to what hf_hub_download
	# returns for that file.
	mean_path, std_path, model_path, opt_path = (
	    hf_hub_download(
	        repo_id="EvanTHU/MotionCLR",
	        filename=remote_name,
	        local_dir=ckptdir,
	        local_dir_use_symlinks=False,
	    )
	    for remote_name in (
	        "meta/mean.npy",
	        "meta/std.npy",
	        "model/latest.tar",
	        "opt.txt",
	    )
	)

	# Point Gradio's temporary-file directory at ./tmp (created if missing).
	os.makedirs("tmp", exist_ok=True)
	os.environ['GRADIO_TEMP_DIR'] = './tmp'

	def generate_md5(input_string):
	    """Return the hexadecimal MD5 digest of ``input_string``.

	    The text is converted to bytes with ``str.encode()`` (default encoding)
	    before it is hashed.
	    """
	    hasher = hashlib.md5()
	    hasher.update(input_string.encode())
	    return hasher.hexdigest()

	def set_all_use_to_false(data):
	    """Set every ``use`` entry of ``data`` to ``False``, descending into nested ``Box`` values.

	    The mapping is modified in place and the same object is returned.
	    """
	    for name, entry in data.items():
	        if isinstance(entry, Box):
	            # A nested Box is always descended into, even when its key is 'use'.
	            set_all_use_to_false(entry)
	            continue
	        if name == 'use':
	            data[name] = False
	    return data

	def yaml_to_box(yaml_file):
	    """Parse the YAML file at ``yaml_file`` with ``yaml.safe_load`` and wrap the result in a ``Box``."""
	    with open(yaml_file, 'r') as stream:
	        parsed = yaml.safe_load(stream)
	    wrapped = Box(parsed)
	    return wrapped

	# Banner markup handed to gr.Markdown when the UI is built. The wrapping
	# <div> is now closed explicitly so the markup is well-formed.
	HEAD = """<div class="embed_hidden">
	<h1 style='text-align: center'> MotionCLR User Interaction Demo </h1>
	</div>
	"""

	# Editing switches, loaded once at import time. The editing functions in
	# this module toggle entries on this shared object and rebind it through
	# `global edit_config`.
	# (GRADIO_TEMP_DIR is already exported earlier in this module, so it is not
	# assigned a second time here.)
	edit_config = yaml_to_box('options/edit.yaml')

	# Stylesheet for the `retrieved_video` / `contour_video` classes.
	# NOTE(review): nothing visible in this file passes CSS to gr.Blocks, so
	# these rules may currently be unused -- confirm before relying on them.
	CSS = """
	.retrieved_video {
	position: relative;
	margin: 0;
	box-shadow: var(--block-shadow);
	border-width: var(--block-border-width);
	border-color: #000000;
	border-radius: var(--block-radius);
	background: var(--block-background-fill);
	width: 100%;
	line-height: var(--line-sm);
	}
	.contour_video {
	display: flex;
	flex-direction: column;
	justify-content: center;
	align-items: center;
	z-index: var(--layer-5);
	border-radius: var(--block-radius);
	background: var(--background-fill-primary);
	padding: 0 var(--size-6);
	max-height: var(--size-screen-h);
	overflow: hidden;
	}
	"""

	def generate_video_from_text(text, opt, pipeline):
	    """Generate a motion for ``text``, render it to an mp4 and return HTML snippets for the UI.

	    Args:
	        text: Prompt passed to ``pipeline.generate`` and used as the plot title.
	        opt: Options object; this function reads ``motion_length``, ``fps``,
	            ``num_samples``, ``meta_dir``, ``joints_num``, ``dataset_name``
	            and ``radius`` from it.
	        pipeline: Object whose ``generate(texts, lengths)`` returns a pair
	            whose first item is the iterable of predicted motions.

	    Returns:
	        A 7-tuple. Positions 0-3 and 5 hold the same ``<video>`` snippet,
	        position 4 holds that snippet followed by a "Content Reference"
	        caption, and position 6 is ``gr.update(visible=True)``.

	    Side effects:
	        Writes the clip under ``./tmp/gen/``, emits ``gr.Info`` toasts and
	        resets every ``use`` flag of the module-level ``edit_config``.
	    """
	    global edit_config

	    width = 500
	    texts = [text]
	    # NOTE(review): the length list is sized by opt.num_samples, not by
	    # len(texts) -- confirm that this is what pipeline.generate expects.
	    motion_lens = [opt.motion_length * opt.fps for _ in range(opt.num_samples)]

	    save_dir = './tmp/gen/'
	    filename = generate_md5(str(time.time())) + ".mp4"
	    save_path = pjoin(save_dir, str(filename))
	    os.makedirs(save_dir, exist_ok=True)

	    # --- generation -------------------------------------------------------
	    tic = time.perf_counter()
	    gr.Info("Generating motion...", duration = 3)
	    lengths = torch.LongTensor([int(x) for x in motion_lens])
	    pred_motions, _ = pipeline.generate(texts, lengths)
	    exc = time.perf_counter() - tic
	    gr.Info(f"Generating time cost: {exc:.2f} s, rendering starts...", duration = 3)

	    # --- post-processing and rendering ------------------------------------
	    tic = time.perf_counter()
	    mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
	    std = np.load(pjoin(opt.meta_dir, 'std.npy'))

	    def _postprocess(raw):
	        # Undo the normalisation, then recover joint positions.
	        joints = raw.cpu().numpy() * std + mean
	        joints = recover_from_ric(torch.from_numpy(joints).float(), opt.joints_num)
	        # Shift along Y (index 1) so the lowest value over all frames/joints is 0.
	        lowest = joints.min(dim=0)[0]
	        floor_height = lowest.min(dim=0)[0][1]
	        joints[:, :, 1] -= floor_height
	        # Temporal smoothing to remove jitter.
	        return motion_temporal_filter(joints.numpy(), sigma=1)

	    samples = [_postprocess(raw) for raw in pred_motions]

	    # Only the first sample is rendered.
	    if opt.dataset_name == 't2m':
	        kinematic_tree = paramUtil.t2m_kinematic_chain
	    else:
	        kinematic_tree = paramUtil.kit_kinematic_chain
	    plot_3d_motion(save_path, kinematic_tree, samples[0], title=texts[0], fps=opt.fps, radius=opt.radius)

	    gr.Info("Rendered motion...", duration = 3)
	    exc = time.perf_counter() - tic
	    gr.Info(f"Rendering time cost: {exc:.2f} s", duration = 3)

	    video_dis = f'<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_path}"></video>'
	    style_dis = video_dis + """<br> <p align="center"> Content Reference </p>"""

	    # A plain generation leaves every editing switch off.
	    edit_config = set_all_use_to_false(edit_config)
	    return (
	        video_dis,
	        video_dis,
	        video_dis,
	        video_dis,
	        style_dis,
	        video_dis,
	        gr.update(visible=True),
	    )

	def reweighting(text, idx, weight, opt, pipeline):
	    """Regenerate ``text`` with the attention re-weighting edit enabled and return the clip as HTML.

	    Args:
	        text: Prompt; it is submitted twice and the second result is rendered.
	        idx: Value stored in ``edit_config.reweighting_attn.idx``.
	        weight: Value stored in
	            ``edit_config.reweighting_attn.reweighting_attn_weight``.
	        opt: Options object; reads ``model_dir``, ``which_ckpt``, ``no_ema``,
	            ``diffuser_name``, ``device``, ``num_inference_steps``,
	            ``motion_length``, ``fps``, ``num_samples``, ``meta_dir``,
	            ``joints_num``, ``dataset_name`` and ``radius``.
	        pipeline: Kept for interface compatibility; a new pipeline is built
	            from the edited configuration and used instead.

	    Returns:
	        A ``<video>`` HTML snippet pointing at the rendered mp4.

	    Side effects:
	        Writes one mp4 under ``./tmp/gen/`` and emits ``gr.Info`` toasts. The
	        ``use`` flags of the module-level ``edit_config`` are reset on exit,
	        including when an exception is raised, so a failed request cannot
	        leave the edit switched on for later requests.
	    """
	    global edit_config
	    try:
	        edit_config.reweighting_attn.use = True
	        edit_config.reweighting_attn.idx = idx
	        edit_config.reweighting_attn.reweighting_attn_weight = weight

	        # The model is rebuilt so that it picks up the edited configuration.
	        gr.Info("Loading Configurations...", duration = 3)
	        model = build_models(opt, edit_config=edit_config)
	        ckpt_path = pjoin(opt.model_dir, opt.which_ckpt + '.tar')
	        load_model_weights(model, ckpt_path, use_ema=not opt.no_ema)

	        pipeline = DiffusePipeline(
	            opt = opt,
	            model = model,
	            diffuser_name = opt.diffuser_name,
	            device=opt.device,
	            num_inference_steps=opt.num_inference_steps,
	            torch_dtype=torch.float16,
	        )

	        print(edit_config)

	        width = 500
	        texts = [text, text]
	        # NOTE(review): sized by opt.num_samples rather than len(texts) --
	        # confirm against what pipeline.generate expects.
	        motion_lens = [opt.motion_length * opt.fps for _ in range(opt.num_samples)]

	        # Only the second sample is rendered, so a single output file is needed.
	        save_dir = './tmp/gen/'
	        save_path = pjoin(save_dir, generate_md5(str(time.time())) + ".mp4")
	        os.makedirs(save_dir, exist_ok=True)

	        start_time = time.perf_counter()
	        gr.Info("Generating motion...", duration = 3)
	        pred_motions, _ = pipeline.generate(texts, torch.LongTensor([int(x) for x in motion_lens]))
	        exc = time.perf_counter() - start_time
	        gr.Info(f"Generating time cost: {exc:.2f} s, rendering starts...", duration = 3)

	        start_time = time.perf_counter()
	        mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
	        std = np.load(pjoin(opt.meta_dir, 'std.npy'))

	        # Undo the normalisation and recover joint positions for sample 1.
	        motion = pred_motions[1].cpu().numpy() * std + mean
	        motion = recover_from_ric(torch.from_numpy(motion).float(), opt.joints_num)
	        # Shift along Y (index 1) so the lowest value over all frames/joints is 0.
	        floor_height = motion.min(dim=0)[0].min(dim=0)[0][1]
	        motion[:, :, 1] -= floor_height
	        # Temporal smoothing to remove jitter.
	        motion = motion_temporal_filter(motion.numpy(), sigma=1)

	        if opt.dataset_name == 't2m':
	            kinematic_tree = paramUtil.t2m_kinematic_chain
	        else:
	            kinematic_tree = paramUtil.kit_kinematic_chain
	        plot_3d_motion(save_path, kinematic_tree, motion, title=texts[1], fps=opt.fps, radius=opt.radius)

	        gr.Info("Rendered motion...", duration = 3)
	        exc = time.perf_counter() - start_time
	        gr.Info(f"Rendering time cost: {exc:.2f} s", duration = 3)

	        return f'<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_path}"></video>'
	    finally:
	        # Always switch the edit off again, on success and on failure alike.
	        edit_config = set_all_use_to_false(edit_config)

	def generate_example_based_motion(text, chunk_size, example_based_steps_end, temp_seed, temp_seed_bar, num_motion, opt, pipeline):
	    """Generate ``num_motion`` example-based variations of ``text`` and return one HTML cell per grid slot.

	    Args:
	        text: Prompt, repeated ``num_motion`` times for generation.
	        chunk_size: Stored in ``edit_config.example_based.chunk_size``.
	        example_based_steps_end: Stored in
	            ``edit_config.example_based.example_based_steps_end``.
	        temp_seed: Stored in ``edit_config.example_based.temp_seed``.
	        temp_seed_bar: Stored in ``edit_config.example_based.temp_seed_bar``.
	        num_motion: Number of clips to generate and render.
	        opt: Options object; reads ``model_dir``, ``which_ckpt``, ``no_ema``,
	            ``diffuser_name``, ``device``, ``num_inference_steps``,
	            ``motion_length``, ``fps``, ``num_samples``, ``meta_dir``,
	            ``joints_num``, ``dataset_name`` and ``radius``.
	        pipeline: Kept for interface compatibility; a new pipeline is built
	            from the edited configuration and used instead.

	    Returns:
	        A list of 24 entries when ``num_motion`` is at most 24: one
	        ``<video>`` HTML snippet per rendered clip, padded with ``None``.

	    Side effects:
	        Writes ``num_motion`` mp4 files under ``./tmp/gen/``, prints each
	        path and emits ``gr.Info`` toasts. The ``use`` flags of the
	        module-level ``edit_config`` are reset on exit, including when an
	        exception is raised.
	    """
	    global edit_config
	    try:
	        edit_config.example_based.use = True
	        edit_config.example_based.chunk_size = chunk_size
	        edit_config.example_based.example_based_steps_end = example_based_steps_end
	        edit_config.example_based.temp_seed = temp_seed
	        edit_config.example_based.temp_seed_bar = temp_seed_bar

	        # The model is rebuilt so that it picks up the edited configuration.
	        gr.Info("Loading Configurations...", duration = 3)
	        model = build_models(opt, edit_config=edit_config)
	        ckpt_path = pjoin(opt.model_dir, opt.which_ckpt + '.tar')
	        load_model_weights(model, ckpt_path, use_ema=not opt.no_ema)

	        pipeline = DiffusePipeline(
	            opt = opt,
	            model = model,
	            diffuser_name = opt.diffuser_name,
	            device=opt.device,
	            num_inference_steps=opt.num_inference_steps,
	            torch_dtype=torch.float16,
	        )

	        width = 500
	        height = 500
	        grid_slots = 24  # number of HTML cells the caller wires up as outputs
	        texts = [text for _ in range(num_motion)]
	        # NOTE(review): sized by opt.num_samples rather than len(texts) --
	        # confirm against what pipeline.generate expects.
	        motion_lens = [opt.motion_length * opt.fps for _ in range(opt.num_samples)]

	        # The clip index is mixed into the hashed string: consecutive
	        # time.time() readings can be identical, which would otherwise give
	        # several clips the same file name and make them overwrite each other.
	        save_dir = './tmp/gen/'
	        save_paths = [
	            pjoin(save_dir, generate_md5(f"{time.time()}-{k}") + ".mp4")
	            for k in range(num_motion)
	        ]
	        os.makedirs(save_dir, exist_ok=True)

	        start_time = time.perf_counter()
	        gr.Info("Generating motion...", duration = 3)
	        pred_motions, _ = pipeline.generate(texts, torch.LongTensor([int(x) for x in motion_lens]))
	        exc = time.perf_counter() - start_time
	        gr.Info(f"Generating time cost: {exc:.2f} s, rendering starts...", duration = 3)

	        start_time = time.perf_counter()
	        mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
	        std = np.load(pjoin(opt.meta_dir, 'std.npy'))

	        progress = gr.Progress()
	        progress(0, desc="Starting...")

	        samples = []
	        for motion in pred_motions:
	            # Undo the normalisation and recover joint positions.
	            motion = motion.cpu().numpy() * std + mean
	            motion = recover_from_ric(torch.from_numpy(motion).float(), opt.joints_num)
	            # Shift along Y (index 1) so the lowest value over all frames/joints is 0.
	            floor_height = motion.min(dim=0)[0].min(dim=0)[0][1]
	            motion[:, :, 1] -= floor_height
	            # Temporal smoothing to remove jitter.
	            samples.append(motion_temporal_filter(motion.numpy(), sigma=1))

	        if opt.dataset_name == 't2m':
	            kinematic_tree = paramUtil.t2m_kinematic_chain
	        else:
	            kinematic_tree = paramUtil.kit_kinematic_chain

	        video_dis = []
	        for k, title in enumerate(progress.tqdm(texts)):
	            clip_path = save_paths[k]
	            print(clip_path)
	            plot_3d_motion(clip_path, kinematic_tree, samples[k], title=title, fps=opt.fps, radius=opt.radius)
	            video_html = f'''
	<video class="retrieved_video" width="{width}" height="{height}" preload="auto" muted playsinline onpause="this.load()" autoplay loop disablepictureinpicture src="./file={clip_path}"> </video>
	'''
	            video_dis.append(video_html)

	        # Pad with None so that every remaining grid cell receives a value.
	        video_dis.extend([None] * (grid_slots - num_motion))

	        gr.Info("Rendered motion...", duration = 3)
	        exc = time.perf_counter() - start_time
	        gr.Info(f"Rendering time cost: {exc:.2f} s", duration = 3)

	        return video_dis
	    finally:
	        # Always switch the edit off again, on success and on failure alike.
	        edit_config = set_all_use_to_false(edit_config)

	def transfer_style(text, style_text, style_transfer_steps_end, opt, pipeline):
	    """Run the style-transfer edit and return HTML for the style reference and the transferred result.

	    Args:
	        text: Content prompt; submitted as the second and third caption.
	        style_text: Style prompt; submitted as the first caption.
	        style_transfer_steps_end: Stored in
	            ``edit_config.style_tranfer.style_transfer_steps_end``.
	        opt: Options object; reads ``model_dir``, ``which_ckpt``, ``no_ema``,
	            ``diffuser_name``, ``device``, ``num_inference_steps``,
	            ``motion_length``, ``fps``, ``num_samples``, ``meta_dir``,
	            ``joints_num``, ``dataset_name`` and ``radius``.
	        pipeline: Kept for interface compatibility; a new pipeline is built
	            from the edited configuration and used instead.

	    Returns:
	        A pair of HTML snippets: the clip for caption 0 captioned
	        "Style Reference" and the clip for caption 1 captioned
	        "Transfered Result".

	    Side effects:
	        Writes three mp4 files under ``./tmp/gen/`` and emits ``gr.Info``
	        toasts. The ``use`` flags of the module-level ``edit_config`` are
	        reset on exit, including when an exception is raised.
	    """
	    global edit_config
	    try:
	        # The key is spelled `style_tranfer` here; it has to match the loaded configuration.
	        edit_config.style_tranfer.use = True
	        edit_config.style_tranfer.style_transfer_steps_end = style_transfer_steps_end

	        # The model is rebuilt so that it picks up the edited configuration.
	        gr.Info("Loading Configurations...", duration = 3)
	        model = build_models(opt, edit_config=edit_config)
	        ckpt_path = pjoin(opt.model_dir, opt.which_ckpt + '.tar')
	        load_model_weights(model, ckpt_path, use_ema=not opt.no_ema)

	        pipeline = DiffusePipeline(
	            opt = opt,
	            model = model,
	            diffuser_name = opt.diffuser_name,
	            device=opt.device,
	            num_inference_steps=opt.num_inference_steps,
	            torch_dtype=torch.float16,
	        )

	        print(edit_config)

	        width = 500
	        texts = [style_text, text, text]
	        # NOTE(review): sized by opt.num_samples rather than len(texts) --
	        # confirm against what pipeline.generate expects.
	        motion_lens = [opt.motion_length * opt.fps for _ in range(opt.num_samples)]

	        # The clip index is mixed into the hashed string: consecutive
	        # time.time() readings can be identical, which would otherwise give
	        # several clips the same file name and make them overwrite each other.
	        save_dir = './tmp/gen/'
	        save_paths = [
	            pjoin(save_dir, generate_md5(f"{time.time()}-{k}") + ".mp4")
	            for k in range(len(texts))
	        ]
	        os.makedirs(save_dir, exist_ok=True)

	        start_time = time.perf_counter()
	        gr.Info("Generating motion...", duration = 3)
	        pred_motions, _ = pipeline.generate(texts, torch.LongTensor([int(x) for x in motion_lens]))
	        exc = time.perf_counter() - start_time
	        gr.Info(f"Generating time cost: {exc:.2f} s, rendering starts...", duration = 3)

	        start_time = time.perf_counter()
	        mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
	        std = np.load(pjoin(opt.meta_dir, 'std.npy'))

	        samples = []
	        for motion in pred_motions:
	            # Undo the normalisation and recover joint positions.
	            motion = motion.cpu().numpy() * std + mean
	            motion = recover_from_ric(torch.from_numpy(motion).float(), opt.joints_num)
	            # Shift along Y (index 1) so the lowest value over all frames/joints is 0.
	            floor_height = motion.min(dim=0)[0].min(dim=0)[0][1]
	            motion[:, :, 1] -= floor_height
	            # Temporal smoothing to remove jitter.
	            samples.append(motion_temporal_filter(motion.numpy(), sigma=1))

	        if opt.dataset_name == 't2m':
	            kinematic_tree = paramUtil.t2m_kinematic_chain
	        else:
	            kinematic_tree = paramUtil.kit_kinematic_chain

	        # NOTE(review): all three clips are rendered, but only clips 0 and 1
	        # are returned; the third one is written to disk and not shown.
	        for k, title in enumerate(texts):
	            plot_3d_motion(save_paths[k], kinematic_tree, samples[k], title=title, fps=opt.fps, radius=opt.radius)

	        gr.Info("Rendered motion...", duration = 3)
	        exc = time.perf_counter() - start_time
	        gr.Info(f"Rendering time cost: {exc:.2f} s", duration = 3)

	        video_dis0 = f"""<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_paths[0]}"></video> <br> <p align="center"> Style Reference </p>"""
	        video_dis2 = f"""<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_paths[1]}"></video> <br> <p align="center"> Transfered Result </p>"""

	        return video_dis0, video_dis2
	    finally:
	        # Always switch the edit off again, on success and on failure alike.
	        edit_config = set_all_use_to_false(edit_config)


	@spaces.GPU
	def main():
	    """Parse the app options, load the model, build the Gradio UI and launch it.

	    The UI has a prompt box with a length slider, a main video area and three
	    editing tabs (re-weighting, example-based generation, style transfer).
	    Every button is wired to the matching module-level function with ``opt``
	    and the pipeline built here bound in.
	    """
	    parser = GenerateOptions()
	    opt = parser.parse_app()
	    set_seed(opt.seed)
	    device_id = opt.gpu_id
	    device = torch.device('cuda:%d' % device_id if torch.cuda.is_available() else 'cpu')
	    opt.device = device

	    # load model
	    model = build_models(opt, edit_config=edit_config)
	    ckpt_path = pjoin(opt.model_dir, opt.which_ckpt + '.tar')
	    load_model_weights(model, ckpt_path, use_ema=not opt.no_ema)

	    pipeline = DiffusePipeline(
	        opt = opt,
	        model = model,
	        diffuser_name = opt.diffuser_name,
	        device=device,
	        num_inference_steps=opt.num_inference_steps,
	        torch_dtype=torch.float16,
	    )

	    # NOTE(review): the module-level CSS string is not passed to gr.Blocks
	    # here; confirm whether it was meant to be applied.
	    with gr.Blocks() as demo:
	        gr.Markdown(HEAD)
	        with gr.Row():
	            with gr.Column(scale=7):
	                text_input = gr.Textbox(label="Input the text prompt to generate motion...")
	            with gr.Column(scale=3):
	                sequence_length = gr.Slider(minimum=1, maximum=9.6, step=0.1, label="Motion length", value=8)
	        with gr.Row():
	            generate_button = gr.Button("Generate motion")

	        with gr.Row():
	            video_display = gr.HTML(label="生成的视频", visible=True)

	        tabs = gr.Tabs(visible=True)
	        with tabs:
	            with gr.Tab("Motion (de-)emphasizing"):
	                with gr.Row():
	                    int_input = gr.Number(label="Editing word index", minimum=0, maximum=70)
	                    weight_input = gr.Slider(minimum=-1, maximum=1, step=0.01, label="Input weight for (de-)emphasizing [-1, 1]", value=0)

	                trim_button = gr.Button("Edit reweighting")

	                with gr.Row():
	                    original_video1 = gr.HTML(label="before editing", visible=False)
	                    edited_video = gr.HTML(label="after editing")

	                trim_button.click(
	                    fn=lambda x, int_input, weight_input : reweighting(x, int_input, weight_input, opt, pipeline),
	                    inputs=[text_input, int_input, weight_input],
	                    outputs=edited_video,
	                )

	            with gr.Tab("Example-based motion genration"):
	                with gr.Row():
	                    with gr.Column(scale=4):
	                        chunk_size = gr.Number(minimum=10, maximum=20, step=10,label="Chunk size (#frames)", value=20)
	                        example_based_steps_end = gr.Number(minimum=0, maximum=9,label="Ending step of manipulation", value=6)
	                    with gr.Column(scale=3):
	                        temp_seed = gr.Number(label="Seed for random", value=200, minimum=0)
	                        temp_seed_bar = gr.Slider(minimum=0, maximum=100, step=1, label="Seed for random bar", value=15)
	                    with gr.Column(scale=3):
	                        num_motion = gr.Radio(choices=[4, 8, 12, 16, 24], value=8, label="Select number of motions")

	                gen_button = gr.Button("Generate example-based motion")

	                # 6 rows x 4 cells = 24 slots, the length of the list that
	                # generate_example_based_motion returns.
	                example_video_display = []
	                for _ in range(6):
	                    with gr.Row():
	                        for _ in range(4):
	                            video = gr.HTML(label="Example-based motion", visible=True)
	                            example_video_display.append(video)

	                gen_button.click(
	                    fn=lambda text, chunk_size, example_based_steps_end, temp_seed, temp_seed_bar, num_motion: generate_example_based_motion(text, chunk_size, example_based_steps_end, temp_seed, temp_seed_bar, num_motion, opt, pipeline),
	                    inputs=[text_input, chunk_size, example_based_steps_end, temp_seed, temp_seed_bar, num_motion],
	                    outputs=example_video_display
	                )

	            with gr.Tab("Style transfer"):
	                with gr.Row():
	                    style_text = gr.Textbox(label="Reference prompt (e.g. 'a man walks.')", value="a man walks.")
	                    style_transfer_steps_end = gr.Number(label="The end step of diffusion (0~9)", minimum=0, maximum=9, value=5)

	                style_transfer_button = gr.Button("Transfer style")

	                with gr.Row():
	                    style_reference = gr.HTML(label="style reference")
	                    original_video4 = gr.HTML(label="before style transfer", visible=False)
	                    styled_video = gr.HTML(label="after style transfer")

	                style_transfer_button.click(
	                    fn=lambda text, style_text, style_transfer_steps_end: transfer_style(text, style_text, style_transfer_steps_end, opt, pipeline),
	                    inputs=[text_input, style_text, style_transfer_steps_end],
	                    outputs=[style_reference, styled_video],
	                )

	        def update_motion_length(sequence_length):
	            # Store the slider value on the shared options object.
	            opt.motion_length = sequence_length

	        def on_generate(text, length, pipeline):
	            """Generate the base clip and return exactly one value per wired output."""
	            update_motion_length(length)
	            results = generate_video_from_text(text, opt, pipeline)
	            # generate_video_from_text returns seven values but only four
	            # components are wired below. Pick the matching ones: the plain
	            # clip for the main display and the re-weighting tab, the
	            # captioned clip (position 4) for the style-transfer tab, and the
	            # trailing visibility update (position 6) for the tab container.
	            return results[0], results[1], results[4], results[6]

	        generate_button.click(
	            fn=lambda text, length: on_generate(text, length, pipeline),
	            inputs=[text_input, sequence_length],
	            outputs=[
	                video_display,
	                original_video1,
	                original_video4,
	                tabs,
	            ],
	            show_progress=True
	        )

	        # Reveal the "before" panels; one update per listed output.
	        generate_button.click(
	            fn=lambda: [gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)],
	            inputs=None,
	            outputs=[video_display, original_video1, original_video4]
	        )

	    demo.launch()


	if __name__ == '__main__':
	    main()