diff --git "a/quickstart.ipynb" "b/quickstart.ipynb" new file mode 100644--- /dev/null +++ "b/quickstart.ipynb" @@ -0,0 +1,174 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading checkpoint from pretrained/VQVAE/net_last.pth\n", + "loading transformer checkpoint from D:\\project\\faithfulpose\\T2M-GPT-main\\pretrained\\net_best_fid.pth\n", + "[0.06827529 0.09180064 0.06120043 0.11549855 0.04500877 0.18981335\n", + " 0.0412125 0.059935 0.18670731 0.09415893 0.04638923]\n", + "tensor([[301, 423, 423, 423, 423, 423, 423, 423, 423, 423, 233, 301, 25, 154,\n", + " 176, 44, 39, 239, 364, 468, 367, 147, 10, 234, 173, 173, 173, 361,\n", + " 173]], device='cuda:0')\n" + ] + } + ], + "source": [ + "\n", + "# change the text here\n", + "clip_text = [\"Walking forward in an even pace\"]\n", + "clip_text = [\"Going ahead in an even pace\"]\n", + "clip_text=['A human utilizes his right arm to help himself to stand up.']\n", + "clip_text=['A native is stepping ahead briskly.']\n", + "clip_text=['A native motions a quarter of a loop to the right.']\n", + "import sys\n", + "sys.argv = ['GPT_eval_multi.py']\n", + "import options.option_transformer as option_trans\n", + "args = option_trans.get_args_parser()\n", + "\n", + "args.dataname = 't2m'\n", + "args.resume_pth = 'pretrained/VQVAE/net_last.pth'\n", + "args.resume_trans = 'pretrained/net_best_fid.pth'\n", + "args.clip_path = 'pretrained/clip_best.pth'\n", + "args.down_t = 2\n", + "args.depth = 3\n", + "args.block_size = 51\n", + "from PIL import Image\n", + "\n", + "from CLIP.clip import clip\n", + "from CLIP.clip import model\n", + "\n", + "import torch\n", + "import numpy as np\n", + "import models.vqvae as vqvae\n", + "import models.t2m_trans as trans\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "## load clip model and datasets\n", + "clip_model, clip_preprocess = clip.load(args.clip_path, device=torch.device('cuda'), jit=False) # Must set jit=False for training\n", + "# clip.model.convert_weights(clip_model) # Actually this line is unnecessary since clip by default already on float16\n", + "clip_model.eval()\n", + "for p in clip_model.parameters():\n", + " p.requires_grad = False\n", + "\n", + "net = vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers\n", + " args.nb_code,\n", + " args.code_dim,\n", + " args.output_emb_width,\n", + " args.down_t,\n", + " args.stride_t,\n", + " args.width,\n", + " args.depth,\n", + " args.dilation_growth_rate)\n", + "\n", + "\n", + "trans_encoder = trans.Text2Motion_Transformer(num_vq=args.nb_code, \n", + " embed_dim=1024, \n", + " clip_dim=args.clip_dim, \n", + " block_size=args.block_size, \n", + " num_layers=9, \n", + " n_head=16, \n", + " drop_out_rate=args.drop_out_rate, \n", + " fc_rate=args.ff_rate)\n", + "\n", + "\n", + "print ('loading checkpoint from {}'.format(args.resume_pth))\n", + "ckpt = torch.load(args.resume_pth, map_location='cpu')\n", + "net.load_state_dict(ckpt['net'], strict=True)\n", + "net.eval()\n", + "net.cuda()\n", + "\n", + "print ('loading transformer checkpoint from {}'.format(args.resume_trans))\n", + "ckpt = torch.load(args.resume_trans, map_location='cpu')\n", + "trans_encoder.load_state_dict(ckpt['trans'], strict=True)\n", + "trans_encoder.eval()\n", + "trans_encoder.cuda()\n", + "\n", + "mean = 
+    "std = torch.from_numpy(np.load('./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/std.npy')).cuda()\n",
+    "\n",
+    "# compute per-word attention weights for the prompt\n",
+    "with torch.no_grad():\n",
+    "    # encode and normalize the text prompt with CLIP\n",
+    "    text_token = clip.tokenize(clip_text[0]).cuda()\n",
+    "    text_encoded, weight = clip_model.encode_text(text_token)\n",
+    "    text_encoded /= text_encoded.norm(dim=-1, keepdim=True)\n",
+    "tokens = clip_text[0].split(\" \")\n",
+    "# attention of the end-of-text token over the word tokens (SOT/EOT positions dropped)\n",
+    "attention_weights = list(weight[-1][0][1 + len(tokens)].cpu().numpy())[:2 + len(tokens)][1:][:-1]\n",
+    "attention_weights = [float(item) for item in attention_weights]\n",
+    "attention_weights = np.array(attention_weights)\n",
+    "normalized_attention = attention_weights / attention_weights.sum()\n",
+    "# print(normalized_attention)\n",
+    "\n",
+    "# encode the prompt and sample a motion token sequence\n",
+    "text = clip.tokenize(clip_text).cuda()\n",
+    "feat_clip_text = clip_model.encode_text(text)[0].float()\n",
+    "index_motion = trans_encoder.sample(feat_clip_text[0:1], False)\n",
+    "attentions = trans_encoder.sample(feat_clip_text[0:1], if_categorial=False, get_att=True)\n",
+    "\n",
+    "# decode the token sequence to a pose sequence and recover joint positions\n",
+    "pred_pose = net.forward_decoder(index_motion)\n",
+    "print(index_motion)\n",
+    "from utils.motion_process import recover_from_ric\n",
+    "pred_xyz = recover_from_ric((pred_pose * std + mean).float(), 22)\n",
+    "xyz = pred_xyz.reshape(1, -1, 22, 3)\n",
+    "\n",
+    "np.save('motion.npy', xyz.detach().cpu().numpy())\n",
+    "\n",
+    "# render the motion to a GIF named after the prompt\n",
+    "import visualization.plot_3d_global as plot_3d\n",
+    "pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(), clip_text, [f'{clip_text[0]}.gif'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       ""
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from IPython.display import HTML, display\n",
+    "import base64\n",
+    "\n",
+    "# embed the rendered GIF in the notebook\n",
+    "b64 = base64.b64encode(open(f'{clip_text[0]}.gif', 'rb').read()).decode('ascii')\n",
+    "display(HTML(f'<img src=\"data:image/gif;base64,{b64}\">'))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "posescript",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
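
Usage note (not part of the patch): a minimal sketch of how the motion.npy file written by the first cell could be reloaded and re-rendered from a plain Python script. It assumes the repository root as the working directory and that the notebook cell above has already been run; the output name motion_reloaded.gif is only an illustrative choice.

    import numpy as np
    import visualization.plot_3d_global as plot_3d

    # joint positions saved by the notebook cell above; shape (1, num_frames, 22, 3)
    xyz = np.load('motion.npy')
    print(xyz.shape)

    # re-render the animation with the same helper the notebook uses
    plot_3d.draw_to_batch(xyz, ['reloaded motion'], ['motion_reloaded.gif'])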