{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading checkpoint from pretrained/VQVAE/net_last.pth\n",
"loading transformer checkpoint from D:\\project\\faithfulpose\\T2M-GPT-main\\pretrained\\net_best_fid.pth\n",
"[0.06827529 0.09180064 0.06120043 0.11549855 0.04500877 0.18981335\n",
" 0.0412125 0.059935 0.18670731 0.09415893 0.04638923]\n",
"tensor([[301, 423, 423, 423, 423, 423, 423, 423, 423, 423, 233, 301, 25, 154,\n",
" 176, 44, 39, 239, 364, 468, 367, 147, 10, 234, 173, 173, 173, 361,\n",
" 173]], device='cuda:0')\n"
]
}
],
"source": [
"\n",
"# change the text here\n",
"clip_text = [\"Walking forward in an even pace\"]\n",
"clip_text = [\"Going ahead in an even pace\"]\n",
"clip_text=['A human utilizes his right arm to help himself to stand up.']\n",
"clip_text=['A native is stepping ahead briskly.']\n",
"clip_text=['A native motions a quarter of a loop to the right.']\n",
"import sys\n",
"sys.argv = ['GPT_eval_multi.py']\n",
"import options.option_transformer as option_trans\n",
"args = option_trans.get_args_parser()\n",
"\n",
"args.dataname = 't2m'\n",
"args.resume_pth = 'pretrained/VQVAE/net_last.pth'\n",
"args.resume_trans = 'pretrained/net_best_fid.pth'\n",
"args.clip_path = 'pretrained/clip_best.pth'\n",
"args.down_t = 2\n",
"args.depth = 3\n",
"args.block_size = 51\n",
"from PIL import Image\n",
"\n",
"from CLIP.clip import clip\n",
"from CLIP.clip import model\n",
"\n",
"import torch\n",
"import numpy as np\n",
"import models.vqvae as vqvae\n",
"import models.t2m_trans as trans\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"## load clip model and datasets\n",
"clip_model, clip_preprocess = clip.load(args.clip_path, device=torch.device('cuda'), jit=False) # Must set jit=False for training\n",
"# clip.model.convert_weights(clip_model) # Actually this line is unnecessary since clip by default already on float16\n",
"clip_model.eval()\n",
"for p in clip_model.parameters():\n",
" p.requires_grad = False\n",
"\n",
"net = vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers\n",
" args.nb_code,\n",
" args.code_dim,\n",
" args.output_emb_width,\n",
" args.down_t,\n",
" args.stride_t,\n",
" args.width,\n",
" args.depth,\n",
" args.dilation_growth_rate)\n",
"\n",
"\n",
"trans_encoder = trans.Text2Motion_Transformer(num_vq=args.nb_code, \n",
" embed_dim=1024, \n",
" clip_dim=args.clip_dim, \n",
" block_size=args.block_size, \n",
" num_layers=9, \n",
" n_head=16, \n",
" drop_out_rate=args.drop_out_rate, \n",
" fc_rate=args.ff_rate)\n",
"\n",
"\n",
"print ('loading checkpoint from {}'.format(args.resume_pth))\n",
"ckpt = torch.load(args.resume_pth, map_location='cpu')\n",
"net.load_state_dict(ckpt['net'], strict=True)\n",
"net.eval()\n",
"net.cuda()\n",
"\n",
"print ('loading transformer checkpoint from {}'.format(args.resume_trans))\n",
"ckpt = torch.load(args.resume_trans, map_location='cpu')\n",
"trans_encoder.load_state_dict(ckpt['trans'], strict=True)\n",
"trans_encoder.eval()\n",
"trans_encoder.cuda()\n",
"\n",
"mean = torch.from_numpy(np.load('./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/mean.npy')).cuda()\n",
"std = torch.from_numpy(np.load('./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/std.npy')).cuda()\n",
"# use this code to calculate attention weights\n",
"with torch.no_grad():\n",
" # Encode and normalize the search query using CLIP\n",
" text_token = clip.tokenize(clip_text[0]).cuda()\n",
" text_encoded, weight = clip_model.encode_text(text_token)\n",
" text_encoded /= text_encoded.norm(dim=-1, keepdim=True)\n",
"tokens = clip_text[0].split(\" \")\n",
"attention_weights = list(weight[-1][0][1+len(tokens)].cpu().numpy())[:2+len(tokens)][1:][:-1]\n",
"attention_weights = [float(item) for item in attention_weights]\n",
"attention_weights = np.array(attention_weights)\n",
"normalized_attention = attention_weights / attention_weights.sum()\n",
"# print(normalized_attention)\n",
"text = clip.tokenize(clip_text).cuda()\n",
"feat_clip_text = clip_model.encode_text(text)[0].float()\n",
"index_motion = trans_encoder.sample(feat_clip_text[0:1], False)\n",
"attentions = trans_encoder.sample(feat_clip_text[0:1],if_categorial=False,get_att=True)\n",
"\n",
"\n",
"pred_pose = net.forward_decoder(index_motion)\n",
"print(index_motion)\n",
"from utils.motion_process import recover_from_ric\n",
"pred_xyz = recover_from_ric((pred_pose*std+mean).float(), 22)\n",
"xyz = pred_xyz.reshape(1, -1, 22, 3)\n",
"\n",
"np.save('motion.npy', xyz.detach().cpu().numpy())\n",
"\n",
"import visualization.plot_3d_global as plot_3d\n",
"pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(),clip_text, [f'{clip_text[0]}.gif'])\n",
"\n"
]
},
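{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch (not part of the original pipeline): plot the normalized per-word\n",
"# CLIP attention computed in the previous cell. Assumes matplotlib is available and\n",
"# that `tokens` and `normalized_attention` are still in scope (their lengths match\n",
"# when each word maps to a single BPE token).\n",
"import matplotlib.pyplot as plt\n",
"\n",
"plt.figure(figsize=(8, 3))\n",
"plt.bar(range(len(tokens)), normalized_attention)\n",
"plt.xticks(range(len(tokens)), tokens, rotation=45, ha='right')\n",
"plt.ylabel('normalized attention')\n",
"plt.title(clip_text[0])\n",
"plt.tight_layout()\n",
"plt.show()"
]
},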
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import HTML\n",
"import base64\n",
"b64 = base64.b64encode(open(f'{clip_text[0]}.gif','rb').read()).decode('ascii')\n",
"display(HTML(f''))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "posescript",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}