{ "cells": [ { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading checkpoint from pretrained/VQVAE/net_last.pth\n", "loading transformer checkpoint from D:\\project\\faithfulpose\\T2M-GPT-main\\pretrained\\net_best_fid.pth\n", "[0.06827529 0.09180064 0.06120043 0.11549855 0.04500877 0.18981335\n", " 0.0412125 0.059935 0.18670731 0.09415893 0.04638923]\n", "tensor([[301, 423, 423, 423, 423, 423, 423, 423, 423, 423, 233, 301, 25, 154,\n", " 176, 44, 39, 239, 364, 468, 367, 147, 10, 234, 173, 173, 173, 361,\n", " 173]], device='cuda:0')\n" ] } ], "source": [ "\n", "# change the text here\n", "clip_text = [\"Walking forward in an even pace\"]\n", "clip_text = [\"Going ahead in an even pace\"]\n", "clip_text=['A human utilizes his right arm to help himself to stand up.']\n", "clip_text=['A native is stepping ahead briskly.']\n", "clip_text=['A native motions a quarter of a loop to the right.']\n", "import sys\n", "sys.argv = ['GPT_eval_multi.py']\n", "import options.option_transformer as option_trans\n", "args = option_trans.get_args_parser()\n", "\n", "args.dataname = 't2m'\n", "args.resume_pth = 'pretrained/VQVAE/net_last.pth'\n", "args.resume_trans = 'pretrained/net_best_fid.pth'\n", "args.clip_path = 'pretrained/clip_best.pth'\n", "args.down_t = 2\n", "args.depth = 3\n", "args.block_size = 51\n", "from PIL import Image\n", "\n", "from CLIP.clip import clip\n", "from CLIP.clip import model\n", "\n", "import torch\n", "import numpy as np\n", "import models.vqvae as vqvae\n", "import models.t2m_trans as trans\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "## load clip model and datasets\n", "clip_model, clip_preprocess = clip.load(args.clip_path, device=torch.device('cuda'), jit=False) # Must set jit=False for training\n", "# clip.model.convert_weights(clip_model) # Actually this line is unnecessary since clip by default already on float16\n", "clip_model.eval()\n", "for p in clip_model.parameters():\n", " p.requires_grad = False\n", "\n", "net = vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers\n", " args.nb_code,\n", " args.code_dim,\n", " args.output_emb_width,\n", " args.down_t,\n", " args.stride_t,\n", " args.width,\n", " args.depth,\n", " args.dilation_growth_rate)\n", "\n", "\n", "trans_encoder = trans.Text2Motion_Transformer(num_vq=args.nb_code, \n", " embed_dim=1024, \n", " clip_dim=args.clip_dim, \n", " block_size=args.block_size, \n", " num_layers=9, \n", " n_head=16, \n", " drop_out_rate=args.drop_out_rate, \n", " fc_rate=args.ff_rate)\n", "\n", "\n", "print ('loading checkpoint from {}'.format(args.resume_pth))\n", "ckpt = torch.load(args.resume_pth, map_location='cpu')\n", "net.load_state_dict(ckpt['net'], strict=True)\n", "net.eval()\n", "net.cuda()\n", "\n", "print ('loading transformer checkpoint from {}'.format(args.resume_trans))\n", "ckpt = torch.load(args.resume_trans, map_location='cpu')\n", "trans_encoder.load_state_dict(ckpt['trans'], strict=True)\n", "trans_encoder.eval()\n", "trans_encoder.cuda()\n", "\n", "mean = torch.from_numpy(np.load('./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/mean.npy')).cuda()\n", "std = torch.from_numpy(np.load('./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/std.npy')).cuda()\n", "# use this code to calculate attention weights\n", "with torch.no_grad():\n", " # Encode and normalize the search query using CLIP\n", " text_token = clip.tokenize(clip_text[0]).cuda()\n", " 
    "    text_encoded, weight = clip_model.encode_text(text_token)\n",
    "    text_encoded /= text_encoded.norm(dim=-1, keepdim=True)\n",
    "\n",
    "# take the attention row at the end-of-text position and keep only the word positions\n",
    "# (drops the start/end tokens; assumes each word maps to a single BPE token)\n",
    "tokens = clip_text[0].split(' ')\n",
    "attention_weights = list(weight[-1][0][1+len(tokens)].cpu().numpy())[:2+len(tokens)][1:][:-1]\n",
    "attention_weights = [float(item) for item in attention_weights]\n",
    "attention_weights = np.array(attention_weights)\n",
    "normalized_attention = attention_weights / attention_weights.sum()\n",
    "# print(normalized_attention)\n",
    "\n",
    "## sample motion tokens conditioned on the CLIP text feature\n",
    "text = clip.tokenize(clip_text).cuda()\n",
    "feat_clip_text = clip_model.encode_text(text)[0].float()\n",
    "index_motion = trans_encoder.sample(feat_clip_text[0:1], False)\n",
    "attentions = trans_encoder.sample(feat_clip_text[0:1], if_categorial=False, get_att=True)\n",
    "\n",
    "## decode the VQ indices, de-normalize, and recover 22-joint xyz positions\n",
    "pred_pose = net.forward_decoder(index_motion)\n",
    "print(index_motion)\n",
    "from utils.motion_process import recover_from_ric\n",
    "pred_xyz = recover_from_ric((pred_pose*std+mean).float(), 22)\n",
    "xyz = pred_xyz.reshape(1, -1, 22, 3)\n",
    "\n",
    "np.save('motion.npy', xyz.detach().cpu().numpy())\n",
    "\n",
    "## render the generated motion to a GIF named after the prompt\n",
    "import visualization.plot_3d_global as plot_3d\n",
    "pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(), clip_text, [f'{clip_text[0]}.gif'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       ""
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from IPython.display import HTML, display\n",
    "import base64\n",
    "# embed the rendered GIF inline as a base64 data URI\n",
    "b64 = base64.b64encode(open(f'{clip_text[0]}.gif', 'rb').read()).decode('ascii')\n",
    "display(HTML(f'<img src=\"data:image/gif;base64,{b64}\" />'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "posescript",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}