Stable-text-to-motion

Runtime error

App Files Files Community

Stable-text-to-motion / app.py

vumichien

Update app.py

b6952b3 about 2 years ago

raw

history blame

11.4 kB

	import sys
	import os

	# PyOpenGL chooses its windowing platform when the OpenGL package is first
	# imported, so the headless EGL backend has to be requested BEFORE the
	# import below; exporting the variable afterwards comes too late.
	os.environ["PYOPENGL_PLATFORM"] = "egl"
	os.environ["MESA_GL_VERSION_OVERRIDE"] = "4.1"
	import OpenGL.GL as gl

	# Run as if VQ-Trans/GPT_eval_multi.py had been started from inside the
	# VQ-Trans checkout: argv is replaced and the working directory changed, so
	# every relative path used later (e.g. 'output/...') resolves inside VQ-Trans.
	# NOTE(review): argv is presumably overwritten so the option parser sees a
	# clean command line - confirm against options/option_transformer.py.
	sys.argv = ['VQ-Trans/GPT_eval_multi.py']
	os.chdir('VQ-Trans')

	# Make the VQ-Trans packages and the pyrender directory inside it importable.
	sys.path.append('/home/user/app/VQ-Trans')
	sys.path.append('/home/user/app/VQ-Trans/pyrender')

	import options.option_transformer as option_trans
	from huggingface_hub import snapshot_download

	# Local directory holding the files of the "vumichien/T2M-GPT" Hub repository.
	model_path = snapshot_download(repo_id="vumichien/T2M-GPT")

	# Start from the project's parsed options, then override the entries this
	# demo needs (dataset name, checkpoint locations, architecture sizes).
	args = option_trans.get_args_parser()
	for _option, _setting in (
	    ('dataname', 't2m'),
	    ('resume_pth', f'{model_path}/VQVAE/net_last.pth'),
	    ('resume_trans', f'{model_path}/VQTransformer_corruption05/net_best_fid.pth'),
	    ('down_t', 2),
	    ('depth', 3),
	    ('block_size', 51),
	):
	    setattr(args, _option, _setting)

	import clip
	import torch
	import numpy as np
	import models.vqvae as vqvae
	import models.t2m_trans as trans
	from utils.motion_process import recover_from_ric
	import visualization.plot_3d_global as plot_3d
	from models.rotation2xyz import Rotation2xyz
	import numpy as np
	from trimesh import Trimesh
	import gc

	import torch
	from visualize.simplify_loc2rot import joints2smpl
	import pyrender
	# import matplotlib.pyplot as plt

	import io
	import imageio
	from shapely import geometry
	import trimesh
	from pyrender.constants import RenderFlags
	import math
	# import ffmpeg
	# from PIL import Image
	import hashlib
	import gradio as gr

	## load the CLIP model on the GPU when one is available, otherwise on the CPU
	is_cuda = torch.cuda.is_available()
	device = torch.device("cuda") if is_cuda else torch.device("cpu")
	print(device)

	# Must set jit=False for training (note kept from the original setup).
	clip_model, clip_preprocess = clip.load(
	    "ViT-B/32",
	    device=device,
	    jit=False,
	    download_root='./',
	)

	# NOTE(review): convert_weights is applied on GPU only - presumably it casts
	# the CLIP weights to half precision; confirm against the clip package.
	if is_cuda:
	    clip.model.convert_weights(clip_model)

	# CLIP is used frozen: evaluation mode, and no parameter requires gradients.
	clip_model.eval()
	clip_model.requires_grad_(False)

	# Positional hyper-parameters of the motion VQ-VAE, all taken from the parsed
	# options. `args` itself is passed first so the model can read the
	# quantizer-specific settings from it.
	_vqvae_hparams = (
	    args.nb_code,
	    args.code_dim,
	    args.output_emb_width,
	    args.down_t,
	    args.stride_t,
	    args.width,
	    args.depth,
	    args.dilation_growth_rate,
	)
	net = vqvae.HumanVQVAE(args, *_vqvae_hparams)

	# Text-to-motion transformer: sizes that depend on the options come from
	# `args`; embedding width, depth and head count are fixed for this demo.
	_transformer_config = dict(
	    num_vq=args.nb_code,
	    embed_dim=1024,
	    clip_dim=args.clip_dim,
	    block_size=args.block_size,
	    num_layers=9,
	    n_head=16,
	    drop_out_rate=args.drop_out_rate,
	    fc_rate=args.ff_rate,
	)
	trans_encoder = trans.Text2Motion_Transformer(**_transformer_config)


	# Restore both networks from their checkpoints. Weights are first loaded onto
	# the CPU and each module is switched to evaluation mode right afterwards.
	for _message, _ckpt_file, _state_key, _module in (
	    ('loading checkpoint from {}', args.resume_pth, 'net', net),
	    ('loading transformer checkpoint from {}', args.resume_trans, 'trans', trans_encoder),
	):
	    print(_message.format(_ckpt_file))
	    ckpt = torch.load(_ckpt_file, map_location='cpu')
	    _module.load_state_dict(ckpt[_state_key], strict=True)
	    _module.eval()

	# Statistics used by predict() to de-normalise the decoder output
	# (pred_pose * std + mean).
	mean = torch.from_numpy(np.load(f'{model_path}/meta/mean.npy'))
	std = torch.from_numpy(np.load(f'{model_path}/meta/std.npy'))

	# Move the models and the statistics to the GPU when one is available.
	if is_cuda:
	    net.cuda()
	    trans_encoder.cuda()
	    mean, std = mean.cuda(), std.cuda()

	def render(motions, device_id=0, name='test_vis'):
	    """Render a joint sequence as an SMPL mesh animation and save it as a GIF.

	    The joints are fitted to SMPL with ``joints2smpl`` (SMPLify), turned into
	    mesh vertices with ``Rotation2xyz`` and rendered frame by frame with an
	    off-screen pyrender renderer. The fitted vertices are cached in
	    ``output/{name}_pred.pt`` and reused when that file already exists.

	    Args:
	        motions: NumPy array of shape ``(frames, njoints, nfeats)``. It is not
	            modified; a shifted copy is used internally.
	        device_id: device index forwarded to ``joints2smpl``.
	        name: key used to build the vertex cache file name.

	    Returns:
	        The path of the written animation. This is always
	        ``'output/results.gif'``, so every call overwrites the same file.
	    """
	    frames, _, _ = motions.shape

	    # Shift a copy (the caller's array stays untouched) so that the lowest
	    # joint of the whole sequence sits at height 0.
	    motions = motions.copy()
	    motions[:, :, 1] -= motions[:, :, 1].min()

	    rot2xyz = Rotation2xyz(device=device)
	    faces = rot2xyz.smpl_model.faces

	    os.makedirs('output', exist_ok=True)
	    cache_path = f'output/{name}_pred.pt'
	    if not os.path.exists(cache_path):
	        print(f'Running SMPLify, it may take a few minutes.')
	        # The fitter is only needed on a cache miss, so it is built here.
	        j2s = joints2smpl(num_frames=frames, device_id=device_id,
	                          cuda=torch.cuda.is_available())
	        motion_tensor, opt_dict = j2s.joint2smpl(motions)  # [nframes, njoints, 3]

	        vertices = rot2xyz(torch.tensor(motion_tensor).clone(), mask=None,
	                           pose_rep='rot6d', translation=True, glob=True,
	                           jointstype='vertices',
	                           vertstrans=True)
	        vertices = vertices.detach().cpu()
	        torch.save(vertices, cache_path)
	    else:
	        vertices = torch.load(cache_path)

	    # The last axis of `vertices` indexes frames: frame i is vertices[0, :, :, i].
	    frames = vertices.shape[3]
	    print(vertices.shape)

	    # Per-coordinate bounds of the mesh over all vertices and all frames.
	    min_x, min_y, min_z = torch.min(torch.min(vertices[0], axis=0)[0], axis=1)[0].tolist()
	    max_x, _, max_z = torch.max(torch.max(vertices[0], axis=0)[0], axis=1)[0].tolist()

	    # Floor rectangle: the x/z footprint of the motion padded by 0.5 per side.
	    minx, maxx = min_x - 0.5, max_x + 0.5
	    minz, maxz = min_z - 0.5, max_z + 0.5
	    polygon = geometry.Polygon([[minx, minz], [minx, maxz], [maxx, maxz], [maxx, minz]])
	    polygon_mesh = trimesh.creation.extrude_polygon(polygon, 1e-5)
	    polygon_mesh.visual.face_colors = [0, 0, 0, 0.21]

	    # ---- everything below is identical for every frame, so build it once ----
	    ## OPAQUE rendering without alpha
	    ## BLEND rendering consider alpha
	    material = pyrender.MetallicRoughnessMaterial(
	        metallicFactor=0.7,
	        alphaMode='OPAQUE',
	        baseColorFactor=(0.11, 0.53, 0.8, 0.5),
	    )
	    polygon_render = pyrender.Mesh.from_trimesh(polygon_mesh, smooth=False)
	    camera = pyrender.PerspectiveCamera(yfov=(np.pi / 3.0))
	    light = pyrender.DirectionalLight(color=[1, 1, 1], intensity=300)
	    bg_color = [1, 1, 1, 0.8]

	    # Rotate the extruded rectangle by 90 degrees about x and place it at the
	    # lowest vertex height.
	    c = np.pi / 2
	    floor_pose = np.array([[1, 0, 0, 0],
	                           [0, np.cos(c), -np.sin(c), min_y],
	                           [0, np.sin(c), np.cos(c), 0],
	                           [0, 0, 0, 1]])

	    # Three directional lights that differ only in their translation.
	    light_poses = []
	    for position in ([0, -1, 1], [0, 1, 1], [1, 1, 2]):
	        light_pose = np.eye(4)
	        light_pose[:3, 3] = position
	        light_poses.append(light_pose)

	    # Camera: centred on the floor in x, at height 1.5, tilted by -30 degrees
	    # about x, and moved back far enough (at least 4) to cover the motion.
	    c = -np.pi / 6
	    camera_pose = np.array([[1, 0, 0, (minx + maxx) / 2],
	                            [0, np.cos(c), -np.sin(c), 1.5],
	                            [0, np.sin(c), np.cos(c),
	                             max(4, minz + (1.5 - min_y) * 2, maxx - minx)],
	                            [0, 0, 0, 1]])

	    # One renderer serves all frames; `finally` releases it even when a frame
	    # fails to render.
	    renderer = pyrender.OffscreenRenderer(960, 960)
	    vid = []
	    try:
	        for i in range(frames):
	            if i % 10 == 0:
	                print(i)

	            body = Trimesh(vertices=vertices[0, :, :, i].squeeze().tolist(), faces=faces)

	            scene = pyrender.Scene(bg_color=bg_color, ambient_light=(0.4, 0.4, 0.4))
	            scene.add(pyrender.Mesh.from_trimesh(body, material=material))
	            scene.add(polygon_render, pose=floor_pose)
	            for light_pose in light_poses:
	                scene.add(light, pose=light_pose)
	            scene.add(camera, pose=camera_pose)

	            # render scene
	            color, _ = renderer.render(scene, flags=RenderFlags.RGBA)
	            vid.append(color)
	    finally:
	        renderer.delete()

	    out = np.stack(vid, axis=0)
	    imageio.mimwrite(f'output/results.gif', out, fps=20)
	    return f'output/results.gif'

	def predict(clip_text, method='fast'):
	    """Generate a motion animation for a text prompt.

	    The prompt is encoded with CLIP, the transformer samples motion token
	    indices from the text feature, the VQ-VAE decoder turns them into a pose
	    sequence, and the de-normalised poses are converted to 22 joint positions
	    with ``recover_from_ric``.

	    Args:
	        clip_text: text prompt describing the motion.
	        method: ``'fast'`` draws the joints with ``plot_3d.draw_to_batch``;
	            ``'slow'`` renders an SMPL mesh with ``render``.

	    Returns:
	        Path of the generated GIF.

	    Raises:
	        ValueError: if ``method`` is neither ``'fast'`` nor ``'slow'``.
	    """
	    # Reject an unknown method before running the expensive inference; the
	    # previous code silently returned None in that case.
	    if method not in ('fast', 'slow'):
	        raise ValueError(f"method must be 'fast' or 'slow', got {method!r}")

	    gc.collect()
	    # Both output paths below write into 'output/'.
	    os.makedirs('output', exist_ok=True)

	    num_joints = 22
	    # Pure inference: do not record an autograd graph.
	    with torch.no_grad():
	        # `device` is the module-level device ("cuda" when available, else "cpu").
	        text = clip.tokenize([clip_text], truncate=True).to(device)
	        feat_clip_text = clip_model.encode_text(text).float()
	        index_motion = trans_encoder.sample(feat_clip_text[0:1], False)
	        pred_pose = net.forward_decoder(index_motion)
	        pred_xyz = recover_from_ric((pred_pose * std + mean).float(), num_joints)

	    if method == 'fast':
	        xyz = pred_xyz.reshape(1, -1, num_joints, 3)
	        plot_3d.draw_to_batch(xyz.detach().cpu().numpy(), title_batch=None,
	                              outname=[f'output/results.gif'])
	        return f'output/results.gif'

	    # method == 'slow': the MD5 of the prompt names render()'s vertex cache
	    # file, so the same prompt text maps to the same cache entry.
	    output_name = hashlib.md5(clip_text.encode()).hexdigest()
	    return render(pred_xyz.detach().cpu().numpy().squeeze(axis=0),
	                  device_id=0, name=output_name)


	# ---- Gradio Layout -----
	# NOTE(review): the nesting of the containers below was reconstructed from a
	# copy of this file that had lost its indentation - confirm it against the
	# deployed layout.
	text_prompt = gr.Textbox(label="Text prompt", lines=1, interactive=True)
	video_out = gr.Video(label="Motion", mirror_webcam=False, interactive=False)
	demo = gr.Blocks()
	demo.encrypt = False

	# Showcase animations displayed above the controls: (image URL, alt text, caption).
	_DEMO_FIGURES = (
	    ("https://huggingface.co/vumichien/T2M-GPT/resolve/main/demo_slow1.gif",
	     "Demo Slow",
	     "a man starts off in an up right position with botg arms extended out by his sides, he then brings his arms down to his body and claps his hands together. after this he wals down amd the the left where he proceeds to sit on a seat"),
	    ("https://huggingface.co/vumichien/T2M-GPT/resolve/main/demo_slow2.gif",
	     "Demo Slow 2",
	     "a person puts their hands together, leans forwards slightly then swings the arms from right to left"),
	    ("https://huggingface.co/vumichien/T2M-GPT/resolve/main/demo_slow3.gif",
	     "Demo Slow 3",
	     "a man is practicing the waltz with a partner"),
	)

	with demo:
	    gr.Markdown('''
	    <div>
	    <h1 style='text-align: center'>Generating Human Motion from Textual Descriptions with Discrete Representations (T2M-GPT)</h1>
	    This space uses <a href='https://mael-zys.github.io/T2M-GPT/' target='_blank'><b>T2M-GPT models</b></a> based on Vector Quantised-Variational AutoEncoder (VQ-VAE) and Generative Pre-trained Transformer (GPT) for human motion generation from textural descriptions🤗
	    </div>
	    ''')

	    # One column per showcase figure. HTML attributes are separated by
	    # whitespace only (the former commas were invalid markup) and the height
	    # value is quoted.
	    with gr.Row():
	        for _src, _alt, _caption in _DEMO_FIGURES:
	            with gr.Column():
	                gr.Markdown(f'''
	                <figure>
	                <img src="{_src}" alt="{_alt}" width="425" height="480"/>
	                <figcaption> {_caption}
	                </figcaption>
	                </figure>
	                ''')

	    with gr.Row():
	        # Left column: instructions and clickable example prompts.
	        with gr.Column():
	            gr.Markdown('''
	            ### Generate human motion by T2M-GPT
	            ##### Step 1. Give prompt text describing human motion
	            ##### Step 2. Choice method to generate output (Fast: Sketch skeleton; Slow: SMPL mesh)
	            ##### Step 3. Generate output and enjoy
	            ''')
	            with gr.Row():
	                gr.Markdown('''
	                ### You can test by following examples:
	                ''')
	                examples = gr.Examples(
	                    examples=[
	                        "a person jogs in place, slowly at first, then increases speed. they then back up and squat down.",
	                        "a man steps forward and does a handstand",
	                        "a man rises from the ground, walks in a circle and sits back down on the ground",
	                    ],
	                    label="Examples",
	                    inputs=[text_prompt],
	                )

	        # Right column: prompt box, method selector, button and result video.
	        with gr.Column():
	            with gr.Row():
	                # text_prompt was created before the Blocks context (the
	                # examples above refer to it), so it is placed here explicitly.
	                text_prompt.render()
	                method = gr.Dropdown(["slow", "fast"], label="Method", value="fast")
	            with gr.Row():
	                generate_btn = gr.Button("Generate")
	                generate_btn.click(predict, [text_prompt, method], [video_out])
	            with gr.Row():
	                video_out.render()

	demo.launch(debug=True)