samadi10 committed on
Commit
eeaa83d
1 Parent(s): 2c84e5f

Added necessary files

.gitignore ADDED
@@ -0,0 +1 @@
1
+ /.DS_STORE
GPT_eval_multi.py ADDED
@@ -0,0 +1,141 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from torch.utils.tensorboard import SummaryWriter
5
+ import json
6
+ import clip
7
+
8
+ import options.option_transformer as option_trans
9
+ import models.vqvae as vqvae
10
+ import utils.utils_model as utils_model
11
+ import utils.eval_trans as eval_trans
12
+ from dataset import dataset_TM_eval
13
+ import models.t2m_trans as trans
14
+ from options.get_eval_option import get_opt
15
+ from models.evaluator_wrapper import EvaluatorModelWrapper
16
+ import warnings
17
+ warnings.filterwarnings('ignore')
18
+ from exit.utils import base_dir, init_save_folder
19
+
20
+ ##### ---- Exp dirs ---- #####
21
+ args = option_trans.get_args_parser()
22
+ torch.manual_seed(args.seed)
23
+
24
+ args.out_dir = f'{args.out_dir}/eval'
25
+ os.makedirs(args.out_dir, exist_ok = True)
26
+ init_save_folder(args)
27
+
28
+ ##### ---- Logger ---- #####
29
+ logger = utils_model.get_logger(args.out_dir)
30
+ writer = SummaryWriter(args.out_dir)
31
+ logger.info(json.dumps(vars(args), indent=4, sort_keys=True))
32
+
33
+ from utils.word_vectorizer import WordVectorizer
34
+ w_vectorizer = WordVectorizer('./glove', 'our_vab')
35
+ val_loader = dataset_TM_eval.DATALoader(args.dataname, True, 32, w_vectorizer)
36
+
37
+ dataset_opt_path = 'checkpoints/kit/Comp_v6_KLD005/opt.txt' if args.dataname == 'kit' else 'checkpoints/t2m/Comp_v6_KLD005/opt.txt'
38
+
39
+ wrapper_opt = get_opt(dataset_opt_path, torch.device('cuda'))
40
+ eval_wrapper = EvaluatorModelWrapper(wrapper_opt)
41
+
42
+ ##### ---- Network ---- #####
43
+ clip_model, clip_preprocess = clip.load("ViT-B/32", device=torch.device('cuda'), jit=False) # Must set jit=False for training
44
+ clip.model.convert_weights(clip_model) # Actually this line is unnecessary since CLIP is already in float16 by default
45
+ clip_model.eval()
46
+ for p in clip_model.parameters():
47
+ p.requires_grad = False
48
+
49
+ # https://github.com/openai/CLIP/issues/111
50
+ class TextCLIP(torch.nn.Module):
51
+ def __init__(self, model) :
52
+ super(TextCLIP, self).__init__()
53
+ self.model = model
54
+
55
+ def forward(self,text):
56
+ with torch.no_grad():
57
+ word_emb = self.model.token_embedding(text).type(self.model.dtype)
58
+ word_emb = word_emb + self.model.positional_embedding.type(self.model.dtype)
59
+ word_emb = word_emb.permute(1, 0, 2) # NLD -> LND
60
+ word_emb = self.model.transformer(word_emb)
61
+ word_emb = self.model.ln_final(word_emb).permute(1, 0, 2).float()
62
+ enctxt = self.model.encode_text(text).float()
63
+ return enctxt, word_emb
64
+ clip_model = TextCLIP(clip_model)
65
+
66
+ net = vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers
67
+ args.nb_code,
68
+ args.code_dim,
69
+ args.output_emb_width,
70
+ args.down_t,
71
+ args.stride_t,
72
+ args.width,
73
+ args.depth,
74
+ args.dilation_growth_rate)
75
+
76
+
77
+ trans_encoder = trans.Text2Motion_Transformer(net,
78
+ num_vq=args.nb_code,
79
+ embed_dim=args.embed_dim_gpt,
80
+ clip_dim=args.clip_dim,
81
+ block_size=args.block_size,
82
+ num_layers=args.num_layers,
83
+ num_local_layer=args.num_local_layer,
84
+ n_head=args.n_head_gpt,
85
+ drop_out_rate=args.drop_out_rate,
86
+ fc_rate=args.ff_rate)
87
+
88
+
89
+ print ('loading checkpoint from {}'.format(args.resume_pth))
90
+ ckpt = torch.load(args.resume_pth, map_location='cpu')
91
+ net.load_state_dict(ckpt['net'], strict=True)
92
+ net.eval()
93
+ net.cuda()
94
+
95
+ if args.resume_trans is not None:
96
+ print ('loading transformer checkpoint from {}'.format(args.resume_trans))
97
+ ckpt = torch.load(args.resume_trans, map_location='cpu')
98
+ trans_encoder.load_state_dict(ckpt['trans'], strict=True)
99
+ trans_encoder.train()
100
+ trans_encoder.cuda()
101
+
102
+
103
+ fid = []
104
+ div = []
105
+ top1 = []
106
+ top2 = []
107
+ top3 = []
108
+ matching = []
109
+ multi = []
110
+ repeat_time = 20
111
+
112
+ from tqdm import tqdm
113
+ for i in tqdm(range(repeat_time)):
114
+ pred_pose_eval, pose, m_length, clip_text, \
115
+ best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, best_multi, writer, logger = eval_trans.evaluation_transformer(args.out_dir, val_loader, net, trans_encoder, logger, writer, 0, best_fid=1000, best_iter=0, best_div=100, best_top1=0, best_top2=0, best_top3=0, best_matching=100, clip_model=clip_model, eval_wrapper=eval_wrapper, dataname=args.dataname, save = False, num_repeat=11, rand_pos=True)
116
+ fid.append(best_fid)
117
+ div.append(best_div)
118
+ top1.append(best_top1)
119
+ top2.append(best_top2)
120
+ top3.append(best_top3)
121
+ matching.append(best_matching)
122
+ multi.append(best_multi)
123
+
124
+ print('final result:')
125
+ print('fid: ', sum(fid)/repeat_time)
126
+ print('div: ', sum(div)/repeat_time)
127
+ print('top1: ', sum(top1)/repeat_time)
128
+ print('top2: ', sum(top2)/repeat_time)
129
+ print('top3: ', sum(top3)/repeat_time)
130
+ print('matching: ', sum(matching)/repeat_time)
131
+ print('multi: ', sum(multi)/repeat_time)
132
+
133
+ fid = np.array(fid)
134
+ div = np.array(div)
135
+ top1 = np.array(top1)
136
+ top2 = np.array(top2)
137
+ top3 = np.array(top3)
138
+ matching = np.array(matching)
139
+ multi = np.array(multi)
140
+ msg_final = f"FID. {np.mean(fid):.3f}, conf. {np.std(fid)*1.96/np.sqrt(repeat_time):.3f}, Diversity. {np.mean(div):.3f}, conf. {np.std(div)*1.96/np.sqrt(repeat_time):.3f}, TOP1. {np.mean(top1):.3f}, conf. {np.std(top1)*1.96/np.sqrt(repeat_time):.3f}, TOP2. {np.mean(top2):.3f}, conf. {np.std(top2)*1.96/np.sqrt(repeat_time):.3f}, TOP3. {np.mean(top3):.3f}, conf. {np.std(top3)*1.96/np.sqrt(repeat_time):.3f}, Matching. {np.mean(matching):.3f}, conf. {np.std(matching)*1.96/np.sqrt(repeat_time):.3f}, Multi. {np.mean(multi):.3f}, conf. {np.std(multi)*1.96/np.sqrt(repeat_time):.3f}"
141
+ logger.info(msg_final)
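
The final `msg_final` above reports, for each metric, the mean over `repeat_time` evaluation runs together with a 95% confidence half-width of the form `1.96 * std / sqrt(n)`. A minimal sketch of that aggregation step, with a hypothetical helper name and placeholder numbers (neither is part of the repository):

```python
import numpy as np

def summarize(values, n_runs):
    """Mean and 95% confidence half-width, matching the formula used in GPT_eval_multi.py."""
    values = np.asarray(values, dtype=np.float64)
    return values.mean(), values.std() * 1.96 / np.sqrt(n_runs)

# Example: aggregate 20 FID measurements the same way the script does.
fid_runs = [0.089, 0.091, 0.087, 0.090] * 5   # placeholder numbers, 20 runs
mean_fid, conf_fid = summarize(fid_runs, n_runs=len(fid_runs))
print(f"FID. {mean_fid:.3f}, conf. {conf_fid:.3f}")
```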
LICENSE-CC-BY-NC-ND-4.0.md ADDED
@@ -0,0 +1,157 @@
1
+ # Attribution-NonCommercial-NoDerivatives 4.0 International
2
+
3
+ > *Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.*
4
+ >
5
+ > ### Using Creative Commons Public Licenses
6
+ >
7
+ > Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.
8
+ >
9
+ > * __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors).
10
+ >
11
+ > * __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees).
12
+
13
+ ## Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License
14
+
15
+ By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
16
+
17
+ ### Section 1 – Definitions.
18
+
19
+ a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
20
+
21
+ b. __Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
22
+
23
+ e. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
24
+
25
+ f. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
26
+
27
+ h. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
28
+
29
+ i. __Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
30
+
31
+ h. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License.
32
+
33
+ i. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
34
+
35
+ j. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
36
+
37
+ k. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
38
+
39
+ l. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
40
+
41
+ ### Section 2 – Scope.
42
+
43
+ a. ___License grant.___
44
+
45
+ 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
46
+
47
+ A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
48
+
49
+ B. produce and reproduce, but not Share, Adapted Material for NonCommercial purposes only.
50
+
51
+ 2. __Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
52
+
53
+ 3. __Term.__ The term of this Public License is specified in Section 6(a).
54
+
55
+ 4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
56
+
57
+ 5. __Downstream recipients.__
58
+
59
+ A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
60
+
61
+ B. __No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
62
+
63
+ 6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
64
+
65
+ b. ___Other rights.___
66
+
67
+ 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
68
+
69
+ 2. Patent and trademark rights are not licensed under this Public License.
70
+
71
+ 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
72
+
73
+ ### Section 3 – License Conditions.
74
+
75
+ Your exercise of the Licensed Rights is expressly made subject to the following conditions.
76
+
77
+ a. ___Attribution.___
78
+
79
+ 1. If You Share the Licensed Material, You must:
80
+
81
+ A. retain the following if it is supplied by the Licensor with the Licensed Material:
82
+
83
+ i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
84
+
85
+ ii. a copyright notice;
86
+
87
+ iii. a notice that refers to this Public License;
88
+
89
+ iv. a notice that refers to the disclaimer of warranties;
90
+
91
+ v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
92
+
93
+ B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
94
+
95
+ C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
96
+
97
+ For the avoidance of doubt, You do not have permission under this Public License to Share Adapted Material.
98
+
99
+ 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
100
+
101
+ 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
102
+
103
+ ### Section 4 – Sui Generis Database Rights.
104
+
105
+ Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
106
+
107
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only and provided You do not Share Adapted Material;
108
+
109
+ b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
110
+
111
+ c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
112
+
113
+ For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
114
+
115
+ ### Section 5 – Disclaimer of Warranties and Limitation of Liability.
116
+
117
+ a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__
118
+
119
+ b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__
120
+
121
+ c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
122
+
123
+ ### Section 6 – Term and Termination.
124
+
125
+ a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
126
+
127
+ b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
128
+
129
+ 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
130
+
131
+ 2. upon express reinstatement by the Licensor.
132
+
133
+ For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
134
+
135
+ c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
136
+
137
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
138
+
139
+ ### Section 7 – Other Terms and Conditions.
140
+
141
+ a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
142
+
143
+ b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
144
+
145
+ ### Section 8 – Interpretation.
146
+
147
+ a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
148
+
149
+ b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
150
+
151
+ c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
152
+
153
+ d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
154
+
155
+ > Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.
156
+ >
157
+ > Creative Commons may be contacted at [creativecommons.org](http://creativecommons.org).
dataset/dataset_TM_eval.py ADDED
@@ -0,0 +1,218 @@
1
+ import torch
2
+ from torch.utils import data
3
+ import numpy as np
4
+ from os.path import join as pjoin
5
+ import random
6
+ import codecs as cs
7
+ from tqdm import tqdm
8
+
9
+ import utils.paramUtil as paramUtil
10
+ from torch.utils.data._utils.collate import default_collate
11
+
12
+
13
+ def collate_fn(batch):
14
+ batch.sort(key=lambda x: x[3], reverse=True)
15
+ return default_collate(batch)
16
+
17
+
18
+ '''For training the text-2-motion generative model'''
19
+ class Text2MotionDataset(data.Dataset):
20
+ def __init__(self, dataset_name, is_test, w_vectorizer, feat_bias = 5, max_text_len = 20, unit_length = 4, shuffle=True):
21
+
22
+ self.max_length = 20
23
+ self.pointer = 0
24
+ self.dataset_name = dataset_name
25
+ self.is_test = is_test
26
+ self.max_text_len = max_text_len
27
+ self.unit_length = unit_length
28
+ self.w_vectorizer = w_vectorizer
29
+ if dataset_name == 't2m':
30
+ self.data_root = './dataset/HumanML3D'
31
+ self.motion_dir = pjoin(self.data_root, 'new_joint_vecs')
32
+ self.text_dir = pjoin(self.data_root, 'texts')
33
+ self.joints_num = 22
34
+ radius = 4
35
+ fps = 20
36
+ self.max_motion_length = 196
37
+ dim_pose = 263
38
+ kinematic_chain = paramUtil.t2m_kinematic_chain
39
+ self.meta_dir = 'checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta'
40
+ elif dataset_name == 'kit':
41
+ self.data_root = './dataset/KIT-ML'
42
+ self.motion_dir = pjoin(self.data_root, 'new_joint_vecs')
43
+ self.text_dir = pjoin(self.data_root, 'texts')
44
+ self.joints_num = 21
45
+ radius = 240 * 8
46
+ fps = 12.5
47
+ dim_pose = 251
48
+ self.max_motion_length = 196
49
+ kinematic_chain = paramUtil.kit_kinematic_chain
50
+ self.meta_dir = 'checkpoints/kit/VQVAEV3_CB1024_CMT_H1024_NRES3/meta'
51
+
52
+ mean = np.load(pjoin(self.meta_dir, 'mean.npy'))
53
+ std = np.load(pjoin(self.meta_dir, 'std.npy'))
54
+
55
+ if is_test:
56
+ split_file = pjoin(self.data_root, 'test.txt')
57
+ else:
58
+ split_file = pjoin(self.data_root, 'val.txt')
59
+
60
+ min_motion_len = 40 if self.dataset_name =='t2m' else 24
61
+ # min_motion_len = 64
62
+
63
+ joints_num = self.joints_num
64
+
65
+ data_dict = {}
66
+ id_list = []
67
+ with cs.open(split_file, 'r') as f:
68
+ for line in f.readlines():
69
+ id_list.append(line.strip())
70
+
71
+ new_name_list = []
72
+ length_list = []
73
+ for name in tqdm(id_list):
74
+ try:
75
+ motion = np.load(pjoin(self.motion_dir, name + '.npy'))
76
+ if (len(motion)) < min_motion_len or (len(motion) >= 200):
77
+ continue
78
+ text_data = []
79
+ flag = False
80
+ with cs.open(pjoin(self.text_dir, name + '.txt')) as f:
81
+ for line in f.readlines():
82
+ text_dict = {}
83
+ line_split = line.strip().split('#')
84
+ caption = line_split[0]
85
+ tokens = line_split[1].split(' ')
86
+ f_tag = float(line_split[2])
87
+ to_tag = float(line_split[3])
88
+ f_tag = 0.0 if np.isnan(f_tag) else f_tag
89
+ to_tag = 0.0 if np.isnan(to_tag) else to_tag
90
+
91
+ text_dict['caption'] = caption
92
+ text_dict['tokens'] = tokens
93
+ if f_tag == 0.0 and to_tag == 0.0:
94
+ flag = True
95
+ text_data.append(text_dict)
96
+ else:
97
+ try:
98
+ n_motion = motion[int(f_tag*fps) : int(to_tag*fps)]
99
+ if (len(n_motion)) < min_motion_len or (len(n_motion) >= 200):
100
+ continue
101
+ new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name
102
+ while new_name in data_dict:
103
+ new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name
104
+ data_dict[new_name] = {'motion': n_motion,
105
+ 'length': len(n_motion),
106
+ 'text':[text_dict]}
107
+ new_name_list.append(new_name)
108
+ length_list.append(len(n_motion))
109
+ except:
110
+ print(line_split)
111
+ print(line_split[2], line_split[3], f_tag, to_tag, name)
112
+ # break
113
+
114
+ if flag:
115
+ data_dict[name] = {'motion': motion,
116
+ 'length': len(motion),
117
+ 'text': text_data}
118
+ new_name_list.append(name)
119
+ length_list.append(len(motion))
120
+ except Exception as e:
121
+ # print(e)
122
+ pass
123
+
124
+ name_list, length_list = zip(*sorted(zip(new_name_list, length_list), key=lambda x: x[1]))
125
+ self.mean = mean
126
+ self.std = std
127
+ self.length_arr = np.array(length_list)
128
+ self.data_dict = data_dict
129
+ self.name_list = name_list
130
+ self.reset_max_len(self.max_length)
131
+ self.shuffle = shuffle
132
+
133
+ def reset_max_len(self, length):
134
+ assert length <= self.max_motion_length
135
+ self.pointer = np.searchsorted(self.length_arr, length)
136
+ print("Pointer Pointing at %d"%self.pointer)
137
+ self.max_length = length
138
+
139
+ def inv_transform(self, data):
140
+ return data * self.std + self.mean
141
+
142
+ def forward_transform(self, data):
143
+ return (data - self.mean) / self.std
144
+
145
+ def __len__(self):
146
+ return len(self.data_dict) - self.pointer
147
+
148
+ def __getitem__(self, item):
149
+ idx = self.pointer + item
150
+ name = self.name_list[idx]
151
+ data = self.data_dict[name]
152
+ # data = self.data_dict[self.name_list[idx]]
153
+ motion, m_length, text_list = data['motion'], data['length'], data['text']
154
+ # Randomly select a caption
155
+ text_data = random.choice(text_list)
156
+ caption, tokens = text_data['caption'], text_data['tokens']
157
+
158
+ if len(tokens) < self.max_text_len:
159
+ # pad with "unk"
160
+ tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
161
+ sent_len = len(tokens)
162
+ tokens = tokens + ['unk/OTHER'] * (self.max_text_len + 2 - sent_len)
163
+ else:
164
+ # crop
165
+ tokens = tokens[:self.max_text_len]
166
+ tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
167
+ sent_len = len(tokens)
168
+ pos_one_hots = []
169
+ word_embeddings = []
170
+ for token in tokens:
171
+ word_emb, pos_oh = self.w_vectorizer[token]
172
+ pos_one_hots.append(pos_oh[None, :])
173
+ word_embeddings.append(word_emb[None, :])
174
+ pos_one_hots = np.concatenate(pos_one_hots, axis=0)
175
+ word_embeddings = np.concatenate(word_embeddings, axis=0)
176
+
177
+ if self.unit_length < 10 and self.shuffle:
178
+ coin2 = np.random.choice(['single', 'single', 'double'])
179
+ else:
180
+ coin2 = 'single'
181
+
182
+ if coin2 == 'double':
183
+ m_length = (m_length // self.unit_length - 1) * self.unit_length
184
+ elif coin2 == 'single':
185
+ m_length = (m_length // self.unit_length) * self.unit_length
186
+ idx = random.randint(0, len(motion) - m_length)
187
+ motion = motion[idx:idx+m_length]
188
+
189
+ "Z Normalization"
190
+ motion = (motion - self.mean) / self.std
191
+
192
+ if m_length < self.max_motion_length and self.shuffle:
193
+ motion = np.concatenate([motion,
194
+ np.zeros((self.max_motion_length - m_length, motion.shape[1]))
195
+ ], axis=0)
196
+
197
+ return word_embeddings, pos_one_hots, caption, sent_len, motion, m_length, '_'.join(tokens), name
198
+
199
+
200
+
201
+
202
+ def DATALoader(dataset_name, is_test,
203
+ batch_size, w_vectorizer,
204
+ num_workers = 8, unit_length = 4, shuffle=True) :
205
+
206
+ val_loader = torch.utils.data.DataLoader(Text2MotionDataset(dataset_name, is_test, w_vectorizer, unit_length=unit_length, shuffle=shuffle),
207
+ batch_size,
208
+ shuffle = shuffle,
209
+ num_workers=num_workers,
210
+ collate_fn=collate_fn,
211
+ drop_last = True)
212
+ return val_loader
213
+
214
+
215
+ def cycle(iterable):
216
+ while True:
217
+ for x in iterable:
218
+ yield x
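
GPT_eval_multi.py consumes this loader as `dataset_TM_eval.DATALoader(args.dataname, True, 32, w_vectorizer)`. A hedged usage sketch, assuming the GloVe vectors, HumanML3D data, and meta checkpoints are laid out at the default paths hard-coded above:

```python
# Sketch only: requires ./glove, ./dataset/HumanML3D and the meta mean/std referenced above.
from utils.word_vectorizer import WordVectorizer
from dataset import dataset_TM_eval

w_vectorizer = WordVectorizer('./glove', 'our_vab')
val_loader = dataset_TM_eval.DATALoader('t2m', True, 32, w_vectorizer)

# Each batch follows the return order of __getitem__ above.
word_emb, pos_one_hots, caption, sent_len, motion, m_length, tokens, name = next(iter(val_loader))
print(motion.shape)  # (32, 196, 263) for HumanML3D: motions are zero-padded to max_motion_length
```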
dataset/dataset_TM_train.py ADDED
@@ -0,0 +1,173 @@
1
+ import torch
2
+ from torch.utils import data
3
+ import numpy as np
4
+ from os.path import join as pjoin
5
+ import random
6
+ import codecs as cs
7
+ from tqdm import tqdm
8
+ import utils.paramUtil as paramUtil
9
+ from torch.utils.data._utils.collate import default_collate
10
+ import random
11
+ import math
12
+
13
+ def collate_fn(batch):
14
+ batch.sort(key=lambda x: x[3], reverse=True)
15
+ return default_collate(batch)
16
+
17
+
18
+ '''For training the text-2-motion generative model'''
19
+ class Text2MotionDataset(data.Dataset):
20
+ def __init__(self, dataset_name, feat_bias = 5, unit_length = 4, codebook_size = 1024, tokenizer_name=None, up_low_sep=False):
21
+
22
+ self.max_length = 64
23
+ self.pointer = 0
24
+ self.dataset_name = dataset_name
25
+ self.up_low_sep = up_low_sep
26
+
27
+ self.unit_length = unit_length
28
+ # self.mot_start_idx = codebook_size
29
+ self.mot_end_idx = codebook_size
30
+ self.mot_pad_idx = codebook_size + 1 # [TODO] I think 513 (codebook_size+1) can be whatever, it will be cropped out
31
+ if dataset_name == 't2m':
32
+ self.data_root = './dataset/HumanML3D'
33
+ self.motion_dir = pjoin(self.data_root, 'new_joint_vecs')
34
+ self.text_dir = pjoin(self.data_root, 'texts')
35
+ self.joints_num = 22
36
+ radius = 4
37
+ fps = 20
38
+ self.max_motion_length = 26 if unit_length == 8 else 50
39
+ dim_pose = 263
40
+ kinematic_chain = paramUtil.t2m_kinematic_chain
41
+ elif dataset_name == 'kit':
42
+ self.data_root = './dataset/KIT-ML'
43
+ self.motion_dir = pjoin(self.data_root, 'new_joint_vecs')
44
+ self.text_dir = pjoin(self.data_root, 'texts')
45
+ self.joints_num = 21
46
+ radius = 240 * 8
47
+ fps = 12.5
48
+ dim_pose = 251
49
+ self.max_motion_length = 26 if unit_length == 8 else 50
50
+ kinematic_chain = paramUtil.kit_kinematic_chain
51
+
52
+ split_file = pjoin(self.data_root, 'train.txt')
53
+
54
+
55
+ id_list = []
56
+ with cs.open(split_file, 'r') as f:
57
+ for line in f.readlines():
58
+ id_list.append(line.strip())
59
+
60
+ new_name_list = []
61
+ data_dict = {}
62
+ for name in tqdm(id_list):
63
+ try:
64
+ m_token_list = np.load(pjoin(tokenizer_name, '%s.npy'%name))
65
+
66
+ # Read text
67
+ with cs.open(pjoin(self.text_dir, name + '.txt')) as f:
68
+ text_data = []
69
+ flag = False
70
+ lines = f.readlines()
71
+
72
+ for line in lines:
73
+ try:
74
+ text_dict = {}
75
+ line_split = line.strip().split('#')
76
+ caption = line_split[0]
77
+ t_tokens = line_split[1].split(' ')
78
+ f_tag = float(line_split[2])
79
+ to_tag = float(line_split[3])
80
+ f_tag = 0.0 if np.isnan(f_tag) else f_tag
81
+ to_tag = 0.0 if np.isnan(to_tag) else to_tag
82
+
83
+ text_dict['caption'] = caption
84
+ text_dict['tokens'] = t_tokens
85
+ if f_tag == 0.0 and to_tag == 0.0:
86
+ flag = True
87
+ text_data.append(text_dict)
88
+ else:
89
+ # [INFO] Checked with KIT: execution doesn't reach here, which means f_tag & to_tag are 0.0 (tags for the caption's from-to frame range)
90
+ m_token_list_new = [tokens[int(f_tag*fps/unit_length) : int(to_tag*fps/unit_length)] for tokens in m_token_list if int(f_tag*fps/unit_length) < int(to_tag*fps/unit_length)]
91
+
92
+ if len(m_token_list_new) == 0:
93
+ continue
94
+ new_name = '%s_%f_%f'%(name, f_tag, to_tag)
95
+
96
+ data_dict[new_name] = {'m_token_list': m_token_list_new,
97
+ 'text':[text_dict]}
98
+ new_name_list.append(new_name)
99
+ except:
100
+ pass
101
+
102
+ if flag:
103
+ data_dict[name] = {'m_token_list': m_token_list,
104
+ 'text':text_data}
105
+ new_name_list.append(name)
106
+ except:
107
+ pass
108
+ self.data_dict = data_dict
109
+ self.name_list = new_name_list
110
+
111
+ def __len__(self):
112
+ return len(self.data_dict)
113
+
114
+ def __getitem__(self, item):
115
+ data = self.data_dict[self.name_list[item]]
116
+ m_token_list, text_list = data['m_token_list'], data['text']
117
+ m_tokens = random.choice(m_token_list)
118
+
119
+ text_data = random.choice(text_list)
120
+ caption= text_data['caption']
121
+
122
+
123
+ coin = np.random.choice([False, False, True])
124
+ # print(len(m_tokens))
125
+ if coin:
126
+ # drop one token at the head or tail
127
+ coin2 = np.random.choice([True, False])
128
+ if coin2:
129
+ m_tokens = m_tokens[:-1]
130
+ else:
131
+ m_tokens = m_tokens[1:]
132
+ m_tokens_len = m_tokens.shape[0]
133
+
134
+ if self.up_low_sep:
135
+ new_len = random.randint(20, self.max_motion_length-1)
136
+ len_mult = math.ceil(new_len/m_tokens_len)
137
+ m_tokens = np.tile(m_tokens, (len_mult, 1))[:new_len]
138
+ m_tokens_len = new_len
139
+ if m_tokens_len+1 < self.max_motion_length:
140
+ m_tokens = np.concatenate([m_tokens, np.ones((1, 2), dtype=int) * self.mot_end_idx, np.ones((self.max_motion_length-1-m_tokens_len, 2), dtype=int) * self.mot_pad_idx], axis=0)
141
+ else:
142
+ m_tokens = np.concatenate([m_tokens, np.ones((1, 2), dtype=int) * self.mot_end_idx], axis=0)
143
+ else:
144
+ if m_tokens_len+1 < self.max_motion_length:
145
+ m_tokens = np.concatenate([m_tokens, np.ones((1), dtype=int) * self.mot_end_idx, np.ones((self.max_motion_length-1-m_tokens_len), dtype=int) * self.mot_pad_idx], axis=0)
146
+ else:
147
+ m_tokens = np.concatenate([m_tokens, np.ones((1), dtype=int) * self.mot_end_idx], axis=0)
148
+ return caption, m_tokens, m_tokens_len
149
+
150
+
151
+
152
+
153
+ def DATALoader(dataset_name,
154
+ batch_size, codebook_size, tokenizer_name, unit_length=4,
155
+ num_workers = 8, up_low_sep=False) :
156
+
157
+ train_loader = torch.utils.data.DataLoader(Text2MotionDataset(dataset_name, codebook_size = codebook_size, tokenizer_name = tokenizer_name, unit_length=unit_length, up_low_sep=up_low_sep),
158
+ batch_size,
159
+ shuffle=True,
160
+ num_workers=num_workers,
161
+ #collate_fn=collate_fn,
162
+ drop_last = True)
163
+
164
+
165
+ return train_loader
166
+
167
+
168
+ def cycle(iterable):
169
+ while True:
170
+ for x in iterable:
171
+ yield x
172
+
173
+
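
One detail worth spelling out: in `__getitem__`, every token sequence gets a single `mot_end_idx` appended and is then padded with `mot_pad_idx` up to `max_motion_length`. A small self-contained sketch of that padding scheme with illustrative values (the real codebook size and lengths come from the training options):

```python
import numpy as np

codebook_size = 512                 # illustrative; set from the VQ-VAE's nb_code in practice
mot_end_idx = codebook_size         # end-of-motion token
mot_pad_idx = codebook_size + 1     # padding token, cropped out downstream
max_motion_length = 26              # value used above when unit_length == 8

m_tokens = np.array([3, 17, 250, 8])          # some motion-token sequence
m_tokens_len = m_tokens.shape[0]
if m_tokens_len + 1 < max_motion_length:
    m_tokens = np.concatenate([
        m_tokens,
        np.ones((1,), dtype=int) * mot_end_idx,
        np.ones((max_motion_length - 1 - m_tokens_len,), dtype=int) * mot_pad_idx,
    ], axis=0)
else:
    m_tokens = np.concatenate([m_tokens, np.ones((1,), dtype=int) * mot_end_idx], axis=0)

print(m_tokens)        # [3 17 250 8 512 513 513 ... 513], length 26
```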
dataset/dataset_VQ.py ADDED
@@ -0,0 +1,109 @@
1
+ import torch
2
+ from torch.utils import data
3
+ import numpy as np
4
+ from os.path import join as pjoin
5
+ import random
6
+ import codecs as cs
7
+ from tqdm import tqdm
8
+
9
+
10
+
11
+ class VQMotionDataset(data.Dataset):
12
+ def __init__(self, dataset_name, window_size = 64, unit_length = 4):
13
+ self.window_size = window_size
14
+ self.unit_length = unit_length
15
+ self.dataset_name = dataset_name
16
+
17
+ if dataset_name == 't2m':
18
+ self.data_root = './dataset/HumanML3D'
19
+ self.motion_dir = pjoin(self.data_root, 'new_joint_vecs')
20
+ self.text_dir = pjoin(self.data_root, 'texts')
21
+ self.joints_num = 22
22
+ self.max_motion_length = 196
23
+ self.meta_dir = 'checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta'
24
+
25
+ elif dataset_name == 'kit':
26
+ self.data_root = './dataset/KIT-ML'
27
+ self.motion_dir = pjoin(self.data_root, 'new_joint_vecs')
28
+ self.text_dir = pjoin(self.data_root, 'texts')
29
+ self.joints_num = 21
30
+
31
+ self.max_motion_length = 196
32
+ self.meta_dir = 'checkpoints/kit/VQVAEV3_CB1024_CMT_H1024_NRES3/meta'
33
+
34
+ joints_num = self.joints_num
35
+
36
+ mean = np.load(pjoin(self.meta_dir, 'mean.npy'))
37
+ std = np.load(pjoin(self.meta_dir, 'std.npy'))
38
+
39
+ split_file = pjoin(self.data_root, 'train.txt')
40
+
41
+ self.data = []
42
+ self.lengths = []
43
+ id_list = []
44
+ with cs.open(split_file, 'r') as f:
45
+ for line in f.readlines():
46
+ id_list.append(line.strip())
47
+
48
+ for name in tqdm(id_list):
49
+ try:
50
+ motion = np.load(pjoin(self.motion_dir, name + '.npy'))
51
+ if motion.shape[0] < self.window_size:
52
+ continue
53
+ self.lengths.append(motion.shape[0] - self.window_size)
54
+ self.data.append(motion)
55
+ except:
56
+ # Some motions may not exist in the KIT dataset
57
+ pass
58
+
59
+
60
+ self.mean = mean
61
+ self.std = std
62
+ print("Total number of motions {}".format(len(self.data)))
63
+
64
+ def inv_transform(self, data):
65
+ return data * self.std + self.mean
66
+
67
+ def compute_sampling_prob(self) :
68
+
69
+ prob = np.array(self.lengths, dtype=np.float32)
70
+ prob /= np.sum(prob)
71
+ return prob
72
+
73
+ def __len__(self):
74
+ return len(self.data)
75
+
76
+ def __getitem__(self, item):
77
+ motion = self.data[item]
78
+
79
+ idx = random.randint(0, len(motion) - self.window_size)
80
+
81
+ motion = motion[idx:idx+self.window_size]
82
+ "Z Normalization"
83
+ motion = (motion - self.mean) / self.std
84
+
85
+ return motion
86
+
87
+ def DATALoader(dataset_name,
88
+ batch_size,
89
+ num_workers = 8,
90
+ window_size = 64,
91
+ unit_length = 4):
92
+
93
+ trainSet = VQMotionDataset(dataset_name, window_size=window_size, unit_length=unit_length)
94
+ prob = trainSet.compute_sampling_prob()
95
+ sampler = torch.utils.data.WeightedRandomSampler(prob, num_samples = len(trainSet) * 1000, replacement=True)
96
+ train_loader = torch.utils.data.DataLoader(trainSet,
97
+ batch_size,
98
+ shuffle=True,
99
+ #sampler=sampler,
100
+ num_workers=num_workers,
101
+ #collate_fn=collate_fn,
102
+ drop_last = True)
103
+
104
+ return train_loader
105
+
106
+ def cycle(iterable):
107
+ while True:
108
+ for x in iterable:
109
+ yield x
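
The `cycle` helper above is how these loaders are typically consumed when training by iteration count rather than by epoch. A minimal usage sketch, assuming the HumanML3D files are in place (the batch size is illustrative):

```python
from dataset import dataset_VQ

train_loader = dataset_VQ.DATALoader('t2m', batch_size=256, window_size=64, unit_length=4)
train_iter = dataset_VQ.cycle(train_loader)

for step in range(3):             # a real run would take many thousands of steps
    motion = next(train_iter)     # (256, 64, 263): z-normalized 64-frame windows for HumanML3D
    print(step, motion.shape)
```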
dataset/dataset_tokenize.py ADDED
@@ -0,0 +1,125 @@
1
+ import torch
2
+ from torch.utils import data
3
+ import numpy as np
4
+ from os.path import join as pjoin
5
+ import random
6
+ import codecs as cs
7
+ from tqdm import tqdm
8
+
9
+
10
+
11
+ class VQMotionDataset(data.Dataset):
12
+ def __init__(self, dataset_name, feat_bias = 5, window_size = 64, unit_length = 8, fill_max_len=False):
13
+ self.window_size = window_size
14
+ self.unit_length = unit_length
15
+ self.feat_bias = feat_bias
16
+ self.fill_max_len = fill_max_len
17
+
18
+ self.dataset_name = dataset_name
19
+ min_motion_len = 40 if dataset_name =='t2m' else 24
20
+
21
+ if dataset_name == 't2m':
22
+ self.data_root = './dataset/HumanML3D'
23
+ self.motion_dir = pjoin(self.data_root, 'new_joint_vecs')
24
+ self.text_dir = pjoin(self.data_root, 'texts')
25
+ self.joints_num = 22
26
+ radius = 4
27
+ fps = 20
28
+ self.max_motion_length = 196
29
+ self.dim_pose = 263
30
+ self.meta_dir = 'checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta'
31
+ #kinematic_chain = paramUtil.t2m_kinematic_chain
32
+ elif dataset_name == 'kit':
33
+ self.data_root = './dataset/KIT-ML'
34
+ self.motion_dir = pjoin(self.data_root, 'new_joint_vecs')
35
+ self.text_dir = pjoin(self.data_root, 'texts')
36
+ self.joints_num = 21
37
+ radius = 240 * 8
38
+ fps = 12.5
39
+ self.dim_pose = 251
40
+ self.max_motion_length = 196
41
+ self.meta_dir = 'checkpoints/kit/VQVAEV3_CB1024_CMT_H1024_NRES3/meta'
42
+ #kinematic_chain = paramUtil.kit_kinematic_chain
43
+
44
+ joints_num = self.joints_num
45
+
46
+ mean = np.load(pjoin(self.meta_dir, 'mean.npy'))
47
+ std = np.load(pjoin(self.meta_dir, 'std.npy'))
48
+
49
+ split_file = pjoin(self.data_root, 'train.txt')
50
+
51
+ data_dict = {}
52
+ id_list = []
53
+ with cs.open(split_file, 'r') as f:
54
+ for line in f.readlines():
55
+ id_list.append(line.strip())
56
+
57
+ new_name_list = []
58
+ length_list = []
59
+ for name in tqdm(id_list):
60
+ try:
61
+ motion = np.load(pjoin(self.motion_dir, name + '.npy'))
62
+ if (len(motion)) < min_motion_len or (len(motion) >= 200):
63
+ continue
64
+
65
+ data_dict[name] = {'motion': motion,
66
+ 'length': len(motion),
67
+ 'name': name}
68
+ new_name_list.append(name)
69
+ length_list.append(len(motion))
70
+ except:
71
+ # Some motions may not exist in the KIT dataset
72
+ pass
73
+
74
+
75
+ self.mean = mean
76
+ self.std = std
77
+ self.length_arr = np.array(length_list)
78
+ self.data_dict = data_dict
79
+ self.name_list = new_name_list
80
+
81
+ def inv_transform(self, data):
82
+ return data * self.std + self.mean
83
+
84
+ def __len__(self):
85
+ return len(self.data_dict)
86
+
87
+ def __getitem__(self, item):
88
+ name = self.name_list[item]
89
+ data = self.data_dict[name]
90
+ motion, m_length = data['motion'], data['length']
91
+
92
+ m_length = (m_length // self.unit_length) * self.unit_length
93
+
94
+ idx = random.randint(0, len(motion) - m_length)
95
+ motion = motion[idx:idx+m_length]
96
+
97
+ if self.fill_max_len:
98
+ motion_zero = np.zeros((self.max_motion_length, self.dim_pose))
99
+ motion_zero[:m_length] = motion
100
+ motion = motion_zero
101
+ motion = (motion - self.mean) / self.std
102
+ return motion, m_length
103
+
104
+ "Z Normalization"
105
+ motion = (motion - self.mean) / self.std
106
+
107
+ return motion, name
108
+
109
+ def DATALoader(dataset_name,
110
+ batch_size = 1,
111
+ num_workers = 8, unit_length = 4, shuffle=True) :
112
+
113
+ train_loader = torch.utils.data.DataLoader(VQMotionDataset(dataset_name, unit_length=unit_length, fill_max_len=batch_size!=1),
114
+ batch_size,
115
+ shuffle=shuffle,
116
+ num_workers=num_workers,
117
+ #collate_fn=collate_fn,
118
+ drop_last = True)
119
+
120
+ return train_loader
121
+
122
+ def cycle(iterable):
123
+ while True:
124
+ for x in iterable:
125
+ yield x
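
dataset_TM_train.py above expects one pre-computed token file per motion, loaded from `tokenizer_name/<name>.npy`. A hedged sketch of how this tokenization loader is typically driven to produce those files; the output directory and the `DummyEncoder` stand-in are assumptions, since the trained VQ-VAE encoder is not part of this commit:

```python
import os
import numpy as np
import torch
from dataset import dataset_tokenize

out_dir = './output/VQVAE/codes'          # assumed output location for the token files
os.makedirs(out_dir, exist_ok=True)

class DummyEncoder(torch.nn.Module):
    """Placeholder for the trained VQ-VAE encoder; replace with the real model's encode call."""
    def forward(self, motion):
        return torch.randint(0, 512, (motion.shape[1] // 4,))   # fake code indices, unit_length = 4

encoder = DummyEncoder()
loader = dataset_tokenize.DATALoader('t2m', batch_size=1, unit_length=4, shuffle=False)

with torch.no_grad():
    for motion, name in loader:
        codes = encoder(motion.float())                          # swap in the real encoder here
        np.save(os.path.join(out_dir, f'{name[0]}.npy'), codes.numpy())
```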
environment.yml ADDED
@@ -0,0 +1,227 @@
1
+ name: MMM
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - conda-forge
6
+ - defaults
7
+ dependencies:
8
+ - _libgcc_mutex=0.1=conda_forge
9
+ - _openmp_mutex=4.5=2_gnu
10
+ - abseil-cpp=20230802.0=h6a678d5_2
11
+ - absl-py=2.1.0=py312h06a4308_0
12
+ - aiohttp=3.9.5=py312h5eee18b_0
13
+ - aiosignal=1.2.0=pyhd3eb1b0_0
14
+ - asttokens=2.4.1=pyhd8ed1ab_0
15
+ - attrs=23.1.0=py312h06a4308_0
16
+ - blas=1.0=mkl
17
+ - blinker=1.6.2=py312h06a4308_0
18
+ - brotli=1.0.9=h5eee18b_8
19
+ - brotli-bin=1.0.9=h5eee18b_8
20
+ - brotli-python=1.0.9=py312h6a678d5_8
21
+ - bzip2=1.0.8=h5eee18b_6
22
+ - c-ares=1.19.1=h5eee18b_0
23
+ - ca-certificates=2024.6.2=hbcca054_0
24
+ - cachetools=5.3.3=py312h06a4308_0
25
+ - certifi=2024.2.2=py312h06a4308_0
26
+ - cffi=1.16.0=py312h5eee18b_1
27
+ - charset-normalizer=2.0.4=pyhd3eb1b0_0
28
+ - click=8.1.7=py312h06a4308_0
29
+ - comm=0.2.2=pyhd8ed1ab_0
30
+ - contourpy=1.2.0=py312hdb19cb5_0
31
+ - cryptography=42.0.5=py312hdda0065_1
32
+ - cuda-cudart=11.8.89=0
33
+ - cuda-cupti=11.8.87=0
34
+ - cuda-libraries=11.8.0=0
35
+ - cuda-nvrtc=11.8.89=0
36
+ - cuda-nvtx=11.8.86=0
37
+ - cuda-runtime=11.8.0=0
38
+ - cuda-version=12.5=3
39
+ - cycler=0.11.0=pyhd3eb1b0_0
40
+ - cyrus-sasl=2.1.28=h52b45da_1
41
+ - dbus=1.13.18=hb2f20db_0
42
+ - debugpy=1.6.7=py312h6a678d5_0
43
+ - decorator=5.1.1=pyhd8ed1ab_0
44
+ - exceptiongroup=1.2.0=pyhd8ed1ab_2
45
+ - executing=2.0.1=pyhd8ed1ab_0
46
+ - expat=2.6.2=h6a678d5_0
47
+ - ffmpeg=4.3=hf484d3e_0
48
+ - filelock=3.13.1=py312h06a4308_0
49
+ - fontconfig=2.14.1=h4c34cd2_2
50
+ - fonttools=4.51.0=py312h5eee18b_0
51
+ - freetype=2.12.1=h4a9f257_0
52
+ - frozenlist=1.4.0=py312h5eee18b_0
53
+ - glib=2.78.4=h6a678d5_0
54
+ - glib-tools=2.78.4=h6a678d5_0
55
+ - gmp=6.2.1=h295c915_3
56
+ - gnutls=3.6.15=he1e5248_0
57
+ - google-auth=2.29.0=py312h06a4308_0
58
+ - google-auth-oauthlib=0.4.1=py_2
59
+ - grpc-cpp=1.48.2=he1ff14a_4
60
+ - grpcio=1.48.2=py312he1ff14a_4
61
+ - gst-plugins-base=1.14.1=h6a678d5_1
62
+ - gstreamer=1.14.1=h5eee18b_1
63
+ - gtest=1.14.0=hdb19cb5_1
64
+ - icu=73.1=h6a678d5_0
65
+ - idna=3.7=py312h06a4308_0
66
+ - importlib-metadata=7.1.0=pyha770c72_0
67
+ - importlib_metadata=7.1.0=hd8ed1ab_0
68
+ - intel-openmp=2023.1.0=hdb19cb5_46306
69
+ - ipykernel=6.29.3=pyhd33586a_0
70
+ - ipython=8.25.0=pyh707e725_0
71
+ - jedi=0.19.1=pyhd8ed1ab_0
72
+ - jinja2=3.1.4=py312h06a4308_0
73
+ - jpeg=9e=h5eee18b_1
74
+ - jupyter_client=8.6.2=pyhd8ed1ab_0
75
+ - jupyter_core=5.5.0=py312h06a4308_0
76
+ - kiwisolver=1.4.4=py312h6a678d5_0
77
+ - krb5=1.20.1=h143b758_1
78
+ - lame=3.100=h7b6447c_0
79
+ - lcms2=2.12=h3be6417_0
80
+ - ld_impl_linux-64=2.38=h1181459_1
81
+ - lerc=3.0=h295c915_0
82
+ - libbrotlicommon=1.0.9=h5eee18b_8
83
+ - libbrotlidec=1.0.9=h5eee18b_8
84
+ - libbrotlienc=1.0.9=h5eee18b_8
85
+ - libclang=14.0.6=default_hc6dbbc7_1
86
+ - libclang13=14.0.6=default_he11475f_1
87
+ - libcublas=11.11.3.6=0
88
+ - libcufft=10.9.0.58=0
89
+ - libcufile=1.10.0.4=0
90
+ - libcups=2.4.2=h2d74bed_1
91
+ - libcurand=10.3.6.39=0
92
+ - libcusolver=11.4.1.48=0
93
+ - libcusparse=11.7.5.86=0
94
+ - libdeflate=1.17=h5eee18b_1
95
+ - libedit=3.1.20230828=h5eee18b_0
96
+ - libffi=3.4.4=h6a678d5_1
97
+ - libgcc-ng=13.2.0=h77fa898_7
98
+ - libgfortran-ng=11.2.0=h00389a5_1
99
+ - libgfortran5=11.2.0=h1234567_1
100
+ - libglib=2.78.4=hdc74915_0
101
+ - libgomp=13.2.0=h77fa898_7
102
+ - libiconv=1.16=h5eee18b_3
103
+ - libidn2=2.3.4=h5eee18b_0
104
+ - libjpeg-turbo=2.0.0=h9bf148f_0
105
+ - libllvm14=14.0.6=hdb19cb5_3
106
+ - libnpp=11.8.0.86=0
107
+ - libnvjpeg=11.9.0.86=0
108
+ - libpng=1.6.39=h5eee18b_0
109
+ - libpq=12.17=hdbd6064_0
110
+ - libprotobuf=3.20.3=he621ea3_0
111
+ - libsodium=1.0.18=h36c2ea0_1
112
+ - libstdcxx-ng=11.2.0=h1234567_1
113
+ - libtasn1=4.19.0=h5eee18b_0
114
+ - libtiff=4.5.1=h6a678d5_0
115
+ - libunistring=0.9.10=h27cfd23_0
116
+ - libuuid=1.41.5=h5eee18b_0
117
+ - libwebp-base=1.3.2=h5eee18b_0
118
+ - libxcb=1.15=h7f8727e_0
119
+ - libxkbcommon=1.0.1=h5eee18b_1
120
+ - libxml2=2.10.4=hfdd30dd_2
121
+ - llvm-openmp=14.0.6=h9e868ea_0
122
+ - lz4-c=1.9.4=h6a678d5_1
123
+ - markdown=3.4.1=py312h06a4308_0
124
+ - markupsafe=2.1.3=py312h5eee18b_0
125
+ - matplotlib=3.8.4=py312h06a4308_0
126
+ - matplotlib-base=3.8.4=py312h526ad5a_0
127
+ - matplotlib-inline=0.1.7=pyhd8ed1ab_0
128
+ - mkl=2023.1.0=h213fc3f_46344
129
+ - mkl-service=2.4.0=py312h5eee18b_1
130
+ - mkl_fft=1.3.8=py312h5eee18b_0
131
+ - mkl_random=1.2.4=py312hdb19cb5_0
132
+ - mpmath=1.3.0=py312h06a4308_0
133
+ - multidict=6.0.4=py312h5eee18b_0
134
+ - mysql=5.7.24=h721c034_2
135
+ - ncurses=6.4=h6a678d5_0
136
+ - nest-asyncio=1.6.0=pyhd8ed1ab_0
137
+ - nettle=3.7.3=hbbd107a_1
138
+ - networkx=3.1=py312h06a4308_0
139
+ - numpy=1.26.4=py312hc5e2394_0
140
+ - numpy-base=1.26.4=py312h0da6c21_0
141
+ - oauthlib=3.2.2=py312h06a4308_0
142
+ - openh264=2.1.1=h4ff587b_0
143
+ - openjpeg=2.4.0=h3ad879b_0
144
+ - openssl=3.3.0=h4ab18f5_3
145
+ - packaging=23.2=py312h06a4308_0
146
+ - parso=0.8.4=pyhd8ed1ab_0
147
+ - pcre2=10.42=hebb0a14_1
148
+ - pexpect=4.9.0=pyhd8ed1ab_0
149
+ - pickleshare=0.7.5=py_1003
150
+ - pillow=10.3.0=py312h5eee18b_0
151
+ - pip=24.0=py312h06a4308_0
152
+ - platformdirs=4.2.2=pyhd8ed1ab_0
153
+ - plotly=5.19.0=py312he106c6f_0
154
+ - ply=3.11=py312h06a4308_1
155
+ - prompt-toolkit=3.0.42=pyha770c72_0
156
+ - protobuf=3.20.3=py312h6a678d5_0
157
+ - psutil=5.9.0=py312h5eee18b_0
158
+ - ptyprocess=0.7.0=pyhd3deb0d_0
159
+ - pure_eval=0.2.2=pyhd8ed1ab_0
160
+ - pyasn1=0.4.8=pyhd3eb1b0_0
161
+ - pyasn1-modules=0.2.8=py_0
162
+ - pybind11-abi=5=hd3eb1b0_0
163
+ - pycparser=2.21=pyhd3eb1b0_0
164
+ - pygments=2.18.0=pyhd8ed1ab_0
165
+ - pyjwt=2.8.0=py312h06a4308_0
166
+ - pyopenssl=24.0.0=py312h06a4308_0
167
+ - pyparsing=3.0.9=py312h06a4308_0
168
+ - pyqt=5.15.10=py312h6a678d5_0
169
+ - pyqt5-sip=12.13.0=py312h5eee18b_0
170
+ - pysocks=1.7.1=py312h06a4308_0
171
+ - python=3.12.3=h996f2a0_1
172
+ - python-dateutil=2.9.0=pyhd8ed1ab_0
173
+ - pytorch=2.3.0=py3.12_cuda11.8_cudnn8.7.0_0
174
+ - pytorch-cuda=11.8=h7e8668a_5
175
+ - pytorch-mutex=1.0=cuda
176
+ - pyyaml=6.0.1=py312h5eee18b_0
177
+ - pyzmq=25.1.2=py312h6a678d5_0
178
+ - qt-main=5.15.2=h53bd1ea_10
179
+ - re2=2022.04.01=h295c915_0
180
+ - readline=8.2=h5eee18b_0
181
+ - requests=2.32.2=py312h06a4308_0
182
+ - requests-oauthlib=1.3.0=py_0
183
+ - rsa=4.7.2=pyhd3eb1b0_1
184
+ - scipy=1.13.0=py312hc5e2394_0
185
+ - setuptools=69.5.1=py312h06a4308_0
186
+ - sip=6.7.12=py312h6a678d5_0
187
+ - six=1.16.0=pyhd3eb1b0_1
188
+ - sqlite=3.45.3=h5eee18b_0
189
+ - stack_data=0.6.2=pyhd8ed1ab_0
190
+ - sympy=1.12=py312h06a4308_0
191
+ - tbb=2021.8.0=hdb19cb5_0
192
+ - tenacity=8.2.2=py312h06a4308_1
193
+ - tensorboard=2.6.0=py_0
194
+ - tensorboard-plugin-wit=1.6.0=py_0
195
+ - tk=8.6.14=h39e8969_0
196
+ - torchvision=0.18.0=py312_cu118
197
+ - tornado=6.3.3=py312h5eee18b_0
198
+ - traitlets=5.14.3=pyhd8ed1ab_0
199
+ - typing_extensions=4.11.0=py312h06a4308_0
200
+ - tzdata=2024a=h04d1e81_0
201
+ - unicodedata2=15.1.0=py312h5eee18b_0
202
+ - urllib3=2.2.1=py312h06a4308_0
203
+ - wcwidth=0.2.13=pyhd8ed1ab_0
204
+ - werkzeug=3.0.3=py312h06a4308_0
205
+ - wheel=0.43.0=py312h06a4308_0
206
+ - xz=5.4.6=h5eee18b_1
207
+ - yaml=0.2.5=h7b6447c_0
208
+ - yarl=1.9.3=py312h5eee18b_0
209
+ - zeromq=4.3.5=h6a678d5_0
210
+ - zipp=3.17.0=pyhd8ed1ab_0
211
+ - zlib=1.2.13=h5eee18b_1
212
+ - zstd=1.5.5=hc292b87_2
213
+ - pip:
214
+ - beautifulsoup4==4.12.3
215
+ - einops==0.8.0
216
+ - fastjsonschema==2.19.1
217
+ - fsspec==2024.5.0
218
+ - ftfy==6.2.0
219
+ - gdown==5.2.0
220
+ - jsonschema==4.22.0
221
+ - jsonschema-specifications==2023.12.1
222
+ - nbformat==5.10.4
223
+ - referencing==0.35.1
224
+ - regex==2024.5.15
225
+ - rpds-py==0.18.1
226
+ - soupsieve==2.5
227
+ - tqdm==4.66.4
exit/t2m-mean.npy ADDED
Binary file (2.23 kB).
exit/t2m-std.npy ADDED
Binary file (2.23 kB).
exit/utils.py ADDED
@@ -0,0 +1,305 @@
1
+ def get_model(model):
2
+ if hasattr(model, 'module'):
3
+ return model.module
4
+ return model
5
+
6
+ import numpy as np
7
+ import torch
8
+ from utils.motion_process import recover_from_ric
9
+ import copy
10
+ import plotly.graph_objects as go
11
+ import shutil
12
+ import datetime
13
+ import os
14
+ import math
15
+
16
+ kit_bone = [[0, 11], [11, 12], [12, 13], [13, 14], [14, 15], [0, 16], [16, 17], [17, 18], [18, 19], [19, 20], [0, 1], [1, 2], [2, 3], [3, 4], [3, 5], [5, 6], [6, 7], [3, 8], [8, 9], [9, 10]]
17
+ t2m_bone = [[0,2], [2,5],[5,8],[8,11],
18
+ [0,1],[1,4],[4,7],[7,10],
19
+ [0,3],[3,6],[6,9],[9,12],[12,15],
20
+ [9,14],[14,17],[17,19],[19,21],
21
+ [9,13],[13,16],[16,18],[18,20]]
22
+ kit_kit_bone = kit_bone + (np.array(kit_bone)+21).tolist()
23
+ t2m_t2m_bone = t2m_bone + (np.array(t2m_bone)+22).tolist()
24
+
25
+ def axis_standard(skeleton):
26
+ skeleton = skeleton.copy()
27
+ # skeleton = -skeleton
28
+ # skeleton[:, :, 0] *= -1
29
+ # xyz => zxy
30
+ skeleton[..., [1, 2]] = skeleton[..., [2, 1]]
31
+ skeleton[..., [0, 1]] = skeleton[..., [1, 0]]
32
+ return skeleton
33
+
34
+ def visualize_2motions(motion1, std, mean, dataset_name, length, motion2=None, save_path=None):
35
+ motion1 = motion1 * std + mean
36
+ if motion2 is not None:
37
+ motion2 = motion2 * std + mean
38
+ if dataset_name == 'kit':
39
+ first_total_standard = 60
40
+ bone_link = kit_bone
41
+ if motion2 is not None:
42
+ bone_link = kit_kit_bone
43
+ joints_num = 21
44
+ scale = 1/1000
45
+ else:
46
+ first_total_standard = 63
47
+ bone_link = t2m_bone
48
+ if motion2 is not None:
49
+ bone_link = t2m_t2m_bone
50
+ joints_num = 22
51
+ scale = 1#/1000
52
+ joint1 = recover_from_ric(torch.from_numpy(motion1).float(), joints_num).numpy()
53
+ if motion2 is not None:
54
+ joint2 = recover_from_ric(torch.from_numpy(motion2).float(), joints_num).numpy()
55
+ joint_original_forward = np.concatenate((joint1, joint2), axis=1)
56
+ else:
57
+ joint_original_forward = joint1
58
+ animate3d(joint_original_forward[:length]*scale,
59
+ BONE_LINK=bone_link,
60
+ first_total_standard=first_total_standard,
61
+ save_path=save_path) # 'init.html'
62
+
63
+ def animate3d(skeleton, BONE_LINK=t2m_bone, first_total_standard=-1, root_path=None, root_path2=None, save_path=None, axis_standard=axis_standard, axis_visible=True):
64
+ # [animation] https://community.plotly.com/t/3d-scatter-animation/46368/6
65
+
66
+ SHIFT_SCALE = 0
67
+ START_FRAME = 0
68
+ NUM_FRAMES = skeleton.shape[0]
69
+ skeleton = skeleton[START_FRAME:NUM_FRAMES+START_FRAME]
70
+ skeleton = axis_standard(skeleton)
71
+ if BONE_LINK is not None:
72
+ # ground truth
73
+ bone_ids = np.array(BONE_LINK)
74
+ _from = skeleton[:, bone_ids[:, 0]]
75
+ _to = skeleton[:, bone_ids[:, 1]]
76
+ # [f 3(from,to,none) d]
77
+ bones = np.empty(
78
+ (_from.shape[0], 3*_from.shape[1], 3), dtype=_from.dtype)
79
+ bones[:, 0::3] = _from
80
+ bones[:, 1::3] = _to
81
+ bones[:, 2::3] = np.full_like(_to, None)
82
+ display_points = bones
83
+ mode = 'lines+markers'
84
+ else:
85
+ display_points = skeleton
86
+ mode = 'markers'
87
+ # follow this thread: https://community.plotly.com/t/3d-scatter-animation/46368/6
88
+ fig = go.Figure(
89
+ data=go.Scatter3d( x=display_points[0, :first_total_standard, 0],
90
+ y=display_points[0, :first_total_standard, 1],
91
+ z=display_points[0, :first_total_standard, 2],
92
+ name='Nodes0',
93
+ mode=mode,
94
+ marker=dict(size=3, color='blue',)),
95
+ layout=go.Layout(
96
+ scene=dict(aspectmode='data',
97
+ camera=dict(eye=dict(x=3, y=0, z=0.1)))
98
+ )
99
+ )
100
+ if first_total_standard != -1:
101
+ fig.add_traces(data=go.Scatter3d(
102
+ x=display_points[0, first_total_standard:, 0],
103
+ y=display_points[0, first_total_standard:, 1],
104
+ z=display_points[0, first_total_standard:, 2],
105
+ name='Nodes1',
106
+ mode=mode,
107
+ marker=dict(size=3, color='red',)))
108
+
109
+ if root_path is not None:
110
+ root_path = axis_standard(root_path)
111
+ fig.add_traces(data=go.Scatter3d(
112
+ x=root_path[:, 0],
113
+ y=root_path[:, 1],
114
+ z=root_path[:, 2],
115
+ name='root_path',
116
+ mode=mode,
117
+ marker=dict(size=2, color='green',)))
118
+ if root_path2 is not None:
119
+ root_path2 = axis_standard(root_path2)
120
+ fig.add_traces(data=go.Scatter3d(
121
+ x=root_path2[:, 0],
122
+ y=root_path2[:, 1],
123
+ z=root_path2[:, 2],
124
+ name='root_path2',
125
+ mode=mode,
126
+ marker=dict(size=2, color='red',)))
127
+
128
+ frames = []
129
+ # frames.append({'data':copy.deepcopy(fig['data']),'name':f'frame{0}'})
130
+
131
+ def update_trace(k):
132
+ fig.update_traces(x=display_points[k, :first_total_standard, 0],
133
+ y=display_points[k, :first_total_standard, 1],
134
+ z=display_points[k, :first_total_standard, 2],
135
+ mode=mode,
136
+ marker=dict(size=3, ),
137
+ # traces=[0],
138
+ selector = ({'name':'Nodes0'}))
139
+ if first_total_standard != -1:
140
+ fig.update_traces(x=display_points[k, first_total_standard:, 0],
141
+ y=display_points[k, first_total_standard:, 1],
142
+ z=display_points[k, first_total_standard:, 2],
143
+ mode=mode,
144
+ marker=dict(size=3, ),
145
+ # traces=[0],
146
+ selector = ({'name':'Nodes1'}))
147
+
148
+ for k in range(0, len(display_points)):
149
+ update_trace(k)
150
+ frames.append({'data':copy.deepcopy(fig['data']),'name':f'frame{k}'})
151
+ update_trace(0)
152
+
153
+ # frames = [go.Frame(data=[go.Scatter3d(
154
+ # x=display_points[k, :, 0],
155
+ # y=display_points[k, :, 1],
156
+ # z=display_points[k, :, 2],
157
+ # mode=mode,
158
+ # marker=dict(size=3, ))],
159
+ # traces=[0],
160
+ # name=f'frame{k}'
161
+ # )for k in range(len(display_points))]
162
+
163
+
164
+
165
+ fig.update(frames=frames)
166
+
167
+ def frame_args(duration):
168
+ return {
169
+ "frame": {"duration": duration},
170
+ "mode": "immediate",
171
+ "fromcurrent": True,
172
+ "transition": {"duration": duration, "easing": "linear"},
173
+ }
174
+
175
+ sliders = [
176
+ {"pad": {"b": 10, "t": 60},
177
+ "len": 0.9,
178
+ "x": 0.1,
179
+ "y": 0,
180
+
181
+ "steps": [
182
+ {"args": [[f.name], frame_args(0)],
183
+ "label": str(k),
184
+ "method": "animate",
185
+ } for k, f in enumerate(fig.frames)
186
+ ]
187
+ }
188
+ ]
189
+
190
+ fig.update_layout(
191
+ updatemenus=[{"buttons": [
192
+ {
193
+ "args": [None, frame_args(1000/25)],
194
+ "label": "Play",
195
+ "method": "animate",
196
+ },
197
+ {
198
+ "args": [[None], frame_args(0)],
199
+ "label": "Pause",
200
+ "method": "animate",
201
+ }],
202
+
203
+ "direction": "left",
204
+ "pad": {"r": 10, "t": 70},
205
+ "type": "buttons",
206
+ "x": 0.1,
207
+ "y": 0,
208
+ }
209
+ ],
210
+ sliders=sliders
211
+ )
212
+ range_x, aspect_x = get_range(skeleton, 0)
213
+ range_y, aspect_y = get_range(skeleton, 1)
214
+ range_z, aspect_z = get_range(skeleton, 2)
215
+
216
+ fig.update_layout(scene=dict(xaxis=dict(range=range_x, visible=axis_visible),
217
+ yaxis=dict(range=range_y, visible=axis_visible),
218
+ zaxis=dict(range=range_z, visible=axis_visible)
219
+ ),
220
+ scene_aspectmode='manual',
221
+ scene_aspectratio=dict(
222
+ x=aspect_x, y=aspect_y, z=aspect_z)
223
+ )
224
+
225
+ fig.update_layout(sliders=sliders)
226
+ fig.show()
227
+ if save_path is not None:
228
+ fig.write_html(save_path, auto_open=False, include_plotlyjs='cdn', full_html=False)
229
+
230
+ def get_range(skeleton, index):
231
+ _min, _max = skeleton[:, :, index].min(), skeleton[:, :, index].max()
232
+ return [_min, _max], _max-_min
233
+
234
+ # [INFO] from http://juditacs.github.io/2018/12/27/masked-attention.html
235
+ def generate_src_mask(T, length):
236
+ B = len(length)
237
+ mask = torch.arange(T).repeat(B, 1).to(length.device) < length.unsqueeze(-1)
238
+ return mask
239
+
240
+ def copyComplete(source, target):
241
+ '''https://stackoverflow.com/questions/19787348/copy-file-keep-permissions-and-owner'''
242
+ # copy content, stat-info (mode too), timestamps...
243
+ if os.path.isfile(source):
244
+ shutil.copy2(source, target)
245
+ else:
246
+ shutil.copytree(source, target, ignore=shutil.ignore_patterns('__pycache__'))
247
+ # copy owner and group
248
+ st = os.stat(source)
249
+ os.chown(target, st.st_uid, st.st_gid)
250
+
251
+ data_permission = os.access('/data/epinyoan', os.R_OK | os.W_OK | os.X_OK)
252
+ base_dir = '/data' if data_permission else '/home'
253
+ def init_save_folder(args, copysource=True):
254
+ import glob
255
+ global base_dir
256
+ if args.exp_name != 'TEMP':
257
+ date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
258
+ args.out_dir = f"./{args.out_dir}/{date}_{args.exp_name}/"
259
+ save_source = f'{args.out_dir}source/'
260
+ os.makedirs(save_source, mode=os.umask(0), exist_ok=False)
261
+ else:
262
+ args.out_dir = os.path.join(args.out_dir, f'{args.exp_name}')
263
+
264
+ def uniform(shape, device = None):
265
+ return torch.zeros(shape, device = device).float().uniform_(0, 1)
266
+
267
+ def cosine_schedule(t):
268
+ return torch.cos(t * math.pi * 0.5)
269
+
270
+ def log(t, eps = 1e-20):
271
+ return torch.log(t.clamp(min = eps))
272
+
273
+ def gumbel_noise(t):
274
+ noise = torch.zeros_like(t).uniform_(0, 1)
275
+ return -log(-log(noise))
276
+
277
+ def gumbel_sample(t, temperature = 1., dim = -1):
278
+ return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim = dim)
279
+
280
+ def top_k(logits, thres = 0.9):
281
+ # [INFO] keep only the top (1 - thres) fraction of logits along the last dim (top 10% by default); the rest are filled with -inf
282
+ k = math.ceil((1 - thres) * logits.shape[-1])
283
+ val, ind = logits.topk(k, dim = -1)
284
+ probs = torch.full_like(logits, float('-inf'))
285
+ probs.scatter_(2, ind, val)
286
+ return probs
287
+
288
+ # https://github.com/lucidrains/DALLE-pytorch/issues/318
289
+ # https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
290
+ from torch.nn import functional as F
291
+ def top_p(logits, thres = 0.1):
292
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
293
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
294
+
295
+ # # Remove tokens with cumulative probability above the threshold
296
+ sorted_indices_to_remove = cumulative_probs > (1 - thres)
297
+ # Shift the indices to the right to keep also the first token above the threshold
298
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
299
+ sorted_indices_to_remove[..., 0] = 0
300
+
301
+ # # scatter sorted tensors to original indexing
302
+ indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
303
+
304
+ logits[indices_to_remove] = float('-inf')
305
+ return logits
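exit/utils.py mixes plotting helpers with the mask-and-sample utilities used by the transformer (generate_src_mask, cosine_schedule, top_k, gumbel_sample, top_p). A small illustrative sketch of those sampling helpers with dummy tensors, assuming it is run from the repository root so the module-level imports resolve:

    # Illustrative only: exercises the helpers above with dummy shapes.
    import torch
    from exit.utils import generate_src_mask, cosine_schedule, top_k, gumbel_sample

    lengths = torch.tensor([3, 5])           # two sequences with 3 and 5 valid tokens
    mask = generate_src_mask(6, lengths)     # (2, 6) boolean mask, True inside each valid length
    print(mask)

    t = torch.linspace(0, 1, 5)
    print(cosine_schedule(t))                # masking ratio decays from 1 to 0 across the schedule

    logits = torch.randn(2, 6, 512)          # (batch, seq, codebook) dummy logits
    filtered = top_k(logits, thres=0.9)      # keeps roughly the top 10% per position, rest set to -inf
    idx = gumbel_sample(filtered, temperature=1.0)
    print(idx.shape)                         # (2, 6) sampled code indices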
generate.py ADDED
@@ -0,0 +1,297 @@
1
+ import torch
2
+ import clip
3
+ import models.vqvae as vqvae
4
+ from models.vqvae_sep import VQVAE_SEP
5
+ import models.t2m_trans as trans
6
+ import models.t2m_trans_uplow as trans_uplow
7
+ import numpy as np
8
+ from exit.utils import visualize_2motions
9
+ import options.option_transformer as option_trans
10
+
11
+
12
+
13
+ ##### ---- CLIP ---- #####
14
+ clip_model, clip_preprocess = clip.load("ViT-B/32", device=torch.device('cpu'), jit=False) # Must set jit=False for training
15
+ clip.model.convert_weights(clip_model) # Actually this line is unnecessary since CLIP already defaults to float16
16
+ clip_model.eval()
17
+ for p in clip_model.parameters():
18
+ p.requires_grad = False
19
+
20
+ # https://github.com/openai/CLIP/issues/111
21
+ class TextCLIP(torch.nn.Module):
22
+ def __init__(self, model) :
23
+ super(TextCLIP, self).__init__()
24
+ self.model = model
25
+
26
+ def forward(self,text):
27
+ with torch.no_grad():
28
+ word_emb = self.model.token_embedding(text).type(self.model.dtype)
29
+ word_emb = word_emb + self.model.positional_embedding.type(self.model.dtype)
30
+ word_emb = word_emb.permute(1, 0, 2) # NLD -> LND
31
+ word_emb = self.model.transformer(word_emb)
32
+ word_emb = self.model.ln_final(word_emb).permute(1, 0, 2).float()
33
+ enctxt = self.model.encode_text(text).float()
34
+ return enctxt, word_emb
35
+ clip_model = TextCLIP(clip_model)
36
+
37
+ def get_vqvae(args, is_upper_edit):
38
+ if not is_upper_edit:
39
+ return vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers
40
+ args.nb_code,
41
+ args.code_dim,
42
+ args.output_emb_width,
43
+ args.down_t,
44
+ args.stride_t,
45
+ args.width,
46
+ args.depth,
47
+ args.dilation_growth_rate)
48
+ else:
49
+ return VQVAE_SEP(args, ## use args to define different parameters in different quantizers
50
+ args.nb_code,
51
+ args.code_dim,
52
+ args.output_emb_width,
53
+ args.down_t,
54
+ args.stride_t,
55
+ args.width,
56
+ args.depth,
57
+ args.dilation_growth_rate,
58
+ moment={'mean': torch.from_numpy(args.mean).float(),
59
+ 'std': torch.from_numpy(args.std).float()},
60
+ sep_decoder=True)
61
+
62
+ def get_maskdecoder(args, vqvae, is_upper_edit):
63
+ transformer = trans if not is_upper_edit else trans_uplow
64
+ return transformer.Text2Motion_Transformer(vqvae,
65
+ num_vq=args.nb_code,
66
+ embed_dim=args.embed_dim_gpt,
67
+ clip_dim=args.clip_dim,
68
+ block_size=args.block_size,
69
+ num_layers=args.num_layers,
70
+ num_local_layer=args.num_local_layer,
71
+ n_head=args.n_head_gpt,
72
+ drop_out_rate=args.drop_out_rate,
73
+ fc_rate=args.ff_rate)
74
+
75
+ class MMM(torch.nn.Module):
76
+ def __init__(self, args=None, is_upper_edit=False):
77
+ super().__init__()
78
+ self.is_upper_edit = is_upper_edit
79
+
80
+
81
+ args.dataname = args.dataset_name = 't2m'
82
+
83
+ self.vqvae = get_vqvae(args, is_upper_edit)
84
+ ckpt = torch.load(args.resume_pth, map_location='cpu')
85
+ self.vqvae.load_state_dict(ckpt['net'], strict=True)
86
+ if is_upper_edit:
87
+ class VQVAE_WRAPPER(torch.nn.Module):
88
+ def __init__(self, vqvae) :
89
+ super().__init__()
90
+ self.vqvae = vqvae
91
+
92
+ def forward(self, *args, **kwargs):
93
+ return self.vqvae(*args, **kwargs)
94
+ self.vqvae = VQVAE_WRAPPER(self.vqvae)
95
+ self.vqvae.eval()
96
+ self.vqvae
97
+
98
+ self.maskdecoder = get_maskdecoder(args, self.vqvae, is_upper_edit)
99
+ ckpt = torch.load(args.resume_trans, map_location='cpu')
100
+ self.maskdecoder.load_state_dict(ckpt['trans'], strict=True)
101
+ self.maskdecoder.train()
102
+ self.maskdecoder
103
+
104
+ def forward(self, text, lengths=-1, rand_pos=True):
105
+ b = len(text)
106
+ feat_clip_text = clip.tokenize(text, truncate=True)
107
+ feat_clip_text, word_emb = clip_model(feat_clip_text)
108
+ index_motion = self.maskdecoder(feat_clip_text, word_emb, type="sample", m_length=lengths, rand_pos=rand_pos, if_test=False)
109
+
110
+ m_token_length = torch.ceil((lengths)/4).int()
111
+ pred_pose_all = torch.zeros((b, 196, 263))
112
+ for k in range(b):
113
+ pred_pose = self.vqvae(index_motion[k:k+1, :m_token_length[k]], type='decode')
114
+ pred_pose_all[k:k+1, :int(lengths[k].item())] = pred_pose
115
+ return pred_pose_all
116
+
117
+ def inbetween_eval(self, base_pose, m_length, start_f, end_f, inbetween_text):
118
+ bs, seq = base_pose.shape[:2]
119
+ tokens = -1*torch.ones((bs, 50), dtype=torch.long)
120
+ m_token_length = torch.ceil((m_length)/4).int()
121
+ start_t = torch.round((start_f)/4).int()
122
+ end_t = torch.round((end_f)/4).int()
123
+
124
+ for k in range(bs):
125
+ index_motion = self.vqvae(base_pose[k:k+1, :m_length[k]], type='encode')
126
+ tokens[k, :start_t[k]] = index_motion[0][:start_t[k]]
127
+ tokens[k, end_t[k]:m_token_length[k]] = index_motion[0][end_t[k]:m_token_length[k]]
128
+
129
+ text = clip.tokenize(inbetween_text, truncate=True)
130
+ feat_clip_text, word_emb_clip = clip_model(text)
131
+
132
+ mask_id = self.maskdecoder.num_vq + 2
133
+ tokens[tokens==-1] = mask_id
134
+ inpaint_index = self.maskdecoder(feat_clip_text, word_emb_clip, type="sample", m_length=m_length, token_cond=tokens)
135
+
136
+ pred_pose_eval = torch.zeros((bs, seq, base_pose.shape[-1]))
137
+ for k in range(bs):
138
+ pred_pose = self.vqvae(inpaint_index[k:k+1, :m_token_length[k]], type='decode')
139
+ pred_pose_eval[k:k+1, :int(m_length[k].item())] = pred_pose
140
+ return pred_pose_eval
141
+
142
+ def long_range(self, text, lengths, num_transition_token=2, output='concat', index_motion=None):
143
+ b = len(text)
144
+ feat_clip_text = clip.tokenize(text, truncate=True)
145
+ feat_clip_text, word_emb = clip_model(feat_clip_text)
146
+ if index_motion is None:
147
+ index_motion = self.maskdecoder(feat_clip_text, word_emb, type="sample", m_length=lengths, rand_pos=False)
148
+
149
+ m_token_length = torch.ceil((lengths)/4).int()
150
+ if output == 'eval':
151
+ frame_length = m_token_length * 4
152
+ m_token_length = m_token_length.clone()
153
+ m_token_length = m_token_length - 2*num_transition_token
154
+ m_token_length[[0,-1]] += num_transition_token # first and last clips only border a transition on one side
155
+
156
+ half_token_length = (m_token_length/2).int()
157
+ idx_full_len = half_token_length >= 24
158
+ half_token_length[idx_full_len] = half_token_length[idx_full_len] - 1
159
+
160
+ mask_id = self.maskdecoder.num_vq + 2
161
+ tokens = -1*torch.ones((b-1, 50), dtype=torch.long)
162
+ transition_train_length = []
163
+
164
+ for i in range(b-1):
165
+ if output == 'concat':
166
+ i_index_motion = index_motion[i]
167
+ i1_index_motion = index_motion[i+1]
168
+ if output == 'eval':
169
+ if i == 0:
170
+ i_index_motion = index_motion[i, :m_token_length[i]]
171
+ else:
172
+ i_index_motion = index_motion[i, num_transition_token:m_token_length[i] + num_transition_token]
173
+ if i == b-1:
174
+ i1_index_motion = index_motion[i+1, :m_token_length[i+1]]
175
+ else:
176
+ i1_index_motion = index_motion[i+1,
177
+ num_transition_token:m_token_length[i+1] + num_transition_token]
178
+ left_end = half_token_length[i]
179
+ right_start = left_end + num_transition_token
180
+ end = right_start + half_token_length[i+1]
181
+
182
+ tokens[i, :left_end] = i_index_motion[m_token_length[i]-left_end: m_token_length[i]]
183
+ tokens[i, left_end:right_start] = mask_id
184
+ tokens[i, right_start:end] = i1_index_motion[:half_token_length[i+1]]
185
+ transition_train_length.append(end)
186
+ transition_train_length = torch.tensor(transition_train_length).to(index_motion.device)
187
+ text = clip.tokenize(text[:-1], truncate=True)
188
+ feat_clip_text, word_emb_clip = clip_model(text)
189
+ inpaint_index = self.maskdecoder(feat_clip_text, word_emb_clip, type="sample", m_length=transition_train_length*4, token_cond=tokens, max_steps=1)
190
+
191
+ if output == 'concat':
192
+ all_tokens = []
193
+ for i in range(b-1):
194
+ all_tokens.append(index_motion[i, :m_token_length[i]])
195
+ all_tokens.append(inpaint_index[i, tokens[i] == mask_id])
196
+ all_tokens.append(index_motion[-1, :m_token_length[-1]])
197
+ all_tokens = torch.cat(all_tokens).unsqueeze(0)
198
+ pred_pose = self.vqvae(all_tokens, type='decode')
199
+ return pred_pose
200
+ elif output == 'eval':
201
+ all_tokens = []
202
+ for i in range(b):
203
+ motion_token = index_motion[i, :m_token_length[i]]
204
+ if i == 0:
205
+ first_current_trans_tok = inpaint_index[i, tokens[i] == mask_id]
206
+ all_tokens.append(motion_token)
207
+ all_tokens.append(first_current_trans_tok)
208
+ else:
209
+ if i < b-1:
210
+ first_current_trans_tok = inpaint_index[i, tokens[i] == mask_id]
211
+ all_tokens.append(motion_token)
212
+ all_tokens.append(first_current_trans_tok)
213
+ else:
214
+ all_tokens.append(motion_token)
215
+ all_tokens = torch.cat(all_tokens)
216
+ pred_pose_concat = self.vqvae(all_tokens.unsqueeze(0), type='decode')
217
+
218
+ trans_frame = num_transition_token*4
219
+ pred_pose = torch.zeros((b, 196, 263))
220
+ current_point = 0
221
+ for i in range(b):
222
+ if i == 0:
223
+ start_f = torch.tensor(0)
224
+ end_f = frame_length[i]
225
+ else:
226
+ start_f = current_point - trans_frame
227
+ end_f = start_f + frame_length[i]
228
+ current_point = end_f
229
+ pred_pose[i, :frame_length[i]] = pred_pose_concat[0, start_f: end_f]
230
+ return pred_pose
231
+
232
+ def upper_edit(self, pose, m_length, upper_text, lower_mask=None):
233
+ pose = pose.clone().float() # bs, nb_joints, joints_dim, seq_len
234
+ m_tokens_len = torch.ceil((m_length)/4)
235
+ bs, seq = pose.shape[:2]
236
+ max_motion_length = int(seq/4) + 1
237
+ mot_end_idx = self.vqvae.vqvae.num_code
238
+ mot_pad_idx = self.vqvae.vqvae.num_code + 1
239
+ mask_id = self.vqvae.vqvae.num_code + 2
240
+ target_lower = []
241
+ for k in range(bs):
242
+ target = self.vqvae(pose[k:k+1, :m_length[k]], type='encode')
243
+ if m_tokens_len[k]+1 < max_motion_length:
244
+ target = torch.cat([target,
245
+ torch.ones((1, 1, 2), dtype=int, device=target.device) * mot_end_idx,
246
+ torch.ones((1, max_motion_length-1-m_tokens_len[k].int().item(), 2), dtype=int, device=target.device) * mot_pad_idx], axis=1)
247
+ else:
248
+ target = torch.cat([target,
249
+ torch.ones((1, 1, 2), dtype=int, device=target.device) * mot_end_idx], axis=1)
250
+ target_lower.append(target[..., 1])
251
+ target_lower = torch.cat(target_lower, axis=0)
252
+
253
+ ### lower mask ###
254
+ if lower_mask is not None:
255
+ lower_mask = torch.cat([lower_mask, torch.zeros(bs, 1, dtype=int)], dim=1).bool()
256
+ target_lower_masked = target_lower.clone()
257
+ target_lower_masked[lower_mask] = mask_id
258
+ select_end = target_lower == mot_end_idx
259
+ target_lower_masked[select_end] = target_lower[select_end]
260
+ else:
261
+ target_lower_masked = target_lower
262
+ ##################
263
+
264
+ pred_len = m_length
265
+ pred_tok_len = m_tokens_len
266
+ pred_pose_eval = torch.zeros((bs, seq, pose.shape[-1]))
267
+
268
+ # __upper_text__ = ['A man punches with right hand.'] * 32
269
+ text = clip.tokenize(upper_text, truncate=True)
270
+ feat_clip_text, word_emb_clip = clip_model(text)
271
+ # index_motion = trans_encoder(feat_clip_text, idx_lower=target_lower_masked, word_emb=word_emb_clip, type="sample", m_length=pred_len, rand_pos=True, CFG=-1)
272
+ index_motion = self.maskdecoder(feat_clip_text, target_lower_masked, word_emb_clip, type="sample", m_length=pred_len, rand_pos=True)
273
+ for i in range(bs):
274
+ all_tokens = torch.cat([
275
+ index_motion[i:i+1, :int(pred_tok_len[i].item()), None],
276
+ target_lower[i:i+1, :int(pred_tok_len[i].item()), None]
277
+ ], axis=-1)
278
+ pred_pose = self.vqvae(all_tokens, type='decode')
279
+ pred_pose_eval[i:i+1, :int(pred_len[i].item())] = pred_pose
280
+
281
+ return pred_pose_eval
282
+
283
+
284
+ if __name__ == '__main__':
285
+ args = option_trans.get_args_parser()
286
+
287
+ # python generate.py --resume-pth '/home/epinyoan/git/MaskText2Motion/T2M-BD/output/vq/2023-07-19-04-17-17_12_VQVAE_20batchResetNRandom_8192_32/net_last.pth' --resume-trans '/home/epinyoan/git/MaskText2Motion/T2M-BD/output/t2m/2023-10-12-10-11-15_HML3D_45_crsAtt1lyr_40breset_WRONG_THIS_20BRESET/net_last.pth' --text 'the person crouches and walks forward.' --length 156
288
+
289
+ mmm = MMM(args)
290
+ pred_pose = mmm([args.text], torch.tensor([args.length]), rand_pos=False)
291
+
292
+ std = np.load('./exit/t2m-std.npy')
293
+ mean = np.load('./exit/t2m-mean.npy')
294
+ file_name = '_'.join(args.text.split(' '))+'_'+str(args.length)
295
+ visualize_2motions(pred_pose[0].detach().cpu().numpy(), std, mean, 't2m', args.length, save_path='./output/'+file_name+'.html')
296
+
297
+
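Besides the single-prompt path in __main__, the MMM wrapper above exposes long_range for chaining several prompts with generated transition tokens. A hedged usage sketch (prompts, lengths and the output path are placeholders, and --resume-pth / --resume-trans still have to point at real checkpoints):

    # Sketch only: chains two prompts into one continuous motion via MMM.long_range.
    import numpy as np
    import torch
    import options.option_transformer as option_trans
    from generate import MMM
    from exit.utils import visualize_2motions

    args = option_trans.get_args_parser()    # expects --resume-pth and --resume-trans, as in the comment above
    mmm = MMM(args)

    texts = ['a person walks forward.', 'the person turns around and sits down.']  # placeholder prompts
    lengths = torch.tensor([100, 120])       # frames per prompt

    # 'concat' decodes both clips plus the in-painted transition tokens as one sequence
    pred_pose = mmm.long_range(texts, lengths, num_transition_token=2, output='concat')

    std = np.load('./exit/t2m-std.npy')
    mean = np.load('./exit/t2m-mean.npy')
    visualize_2motions(pred_pose[0].detach().cpu().numpy(), std, mean, 't2m',
                       pred_pose.shape[1], save_path='./output/long_range_demo.html')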
models/encdec.py ADDED
@@ -0,0 +1,76 @@
1
+ import torch.nn as nn
2
+ from models.resnet import Resnet1D
3
+
4
+ class PrintModule(nn.Module):
5
+ def __init__(self, me=''):
6
+ super().__init__()
7
+ self.me = me
8
+
9
+ def forward(self, x):
10
+ print(self.me, x.shape)
11
+ return x
12
+
13
+ class Encoder(nn.Module):
14
+ def __init__(self,
15
+ input_emb_width = 3,
16
+ output_emb_width = 512,
17
+ down_t = 3,
18
+ stride_t = 2,
19
+ width = 512,
20
+ depth = 3,
21
+ dilation_growth_rate = 3,
22
+ activation='relu',
23
+ norm=None):
24
+ super().__init__()
25
+
26
+ blocks = []
27
+ filter_t, pad_t = stride_t * 2, stride_t // 2
28
+ blocks.append(nn.Conv1d(input_emb_width, width, 3, 1, 1))
29
+ blocks.append(nn.ReLU())
30
+
31
+ for i in range(down_t):
32
+ input_dim = width
33
+ block = nn.Sequential(
34
+ nn.Conv1d(input_dim, width, filter_t, stride_t, pad_t),
35
+ Resnet1D(width, depth, dilation_growth_rate, activation=activation, norm=norm),
36
+ )
37
+ blocks.append(block)
38
+ blocks.append(nn.Conv1d(width, output_emb_width, 3, 1, 1))
39
+ self.model = nn.Sequential(*blocks)
40
+
41
+ def forward(self, x):
42
+ return self.model(x)
43
+
44
+ class Decoder(nn.Module):
45
+ def __init__(self,
46
+ input_emb_width = 3,
47
+ output_emb_width = 512,
48
+ down_t = 3,
49
+ stride_t = 2,
50
+ width = 512,
51
+ depth = 3,
52
+ dilation_growth_rate = 3,
53
+ activation='relu',
54
+ norm=None):
55
+ super().__init__()
56
+ blocks = []
57
+
58
+ filter_t, pad_t = stride_t * 2, stride_t // 2
59
+ blocks.append(nn.Conv1d(output_emb_width, width, 3, 1, 1))
60
+ blocks.append(nn.ReLU())
61
+ for i in range(down_t):
62
+ out_dim = width
63
+ block = nn.Sequential(
64
+ Resnet1D(width, depth, dilation_growth_rate, reverse_dilation=True, activation=activation, norm=norm),
65
+ nn.Upsample(scale_factor=2, mode='nearest'),
66
+ nn.Conv1d(width, out_dim, 3, 1, 1)
67
+ )
68
+ blocks.append(block)
69
+ blocks.append(nn.Conv1d(width, width, 3, 1, 1))
70
+ blocks.append(nn.ReLU())
71
+ blocks.append(nn.Conv1d(width, input_emb_width, 3, 1, 1))
72
+ self.model = nn.Sequential(*blocks)
73
+
74
+ def forward(self, x):
75
+ return self.model(x)
76
+
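A quick shape sketch for the Encoder/Decoder pair above: the temporal axis is divided by stride_t**down_t on the way down and restored by the nearest-neighbour upsampling on the way back. down_t=2 here is a representative choice matching the /4 token rate that generate.py assumes (m_token_length = ceil(frames/4)), not necessarily the trained configuration:

    # Shape check only, with representative hyper-parameters.
    import torch
    from models.encdec import Encoder, Decoder

    enc = Encoder(input_emb_width=263, output_emb_width=512, down_t=2, stride_t=2)
    dec = Decoder(input_emb_width=263, output_emb_width=512, down_t=2, stride_t=2)

    x = torch.randn(4, 263, 64)        # (batch, pose_dim, frames)
    z = enc(x)
    print(z.shape)                     # torch.Size([4, 512, 16]) -> frames / stride_t**down_t
    x_hat = dec(z)
    print(x_hat.shape)                 # torch.Size([4, 263, 64])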
models/evaluator_wrapper.py ADDED
@@ -0,0 +1,92 @@
1
+
2
+ import torch
3
+ from os.path import join as pjoin
4
+ import numpy as np
5
+ from models.modules import MovementConvEncoder, TextEncoderBiGRUCo, MotionEncoderBiGRUCo
6
+ from utils.word_vectorizer import POS_enumerator
7
+
8
+ def build_models(opt):
9
+ movement_enc = MovementConvEncoder(opt.dim_pose-4, opt.dim_movement_enc_hidden, opt.dim_movement_latent)
10
+ text_enc = TextEncoderBiGRUCo(word_size=opt.dim_word,
11
+ pos_size=opt.dim_pos_ohot,
12
+ hidden_size=opt.dim_text_hidden,
13
+ output_size=opt.dim_coemb_hidden,
14
+ device=opt.device)
15
+
16
+ motion_enc = MotionEncoderBiGRUCo(input_size=opt.dim_movement_latent,
17
+ hidden_size=opt.dim_motion_hidden,
18
+ output_size=opt.dim_coemb_hidden,
19
+ device=opt.device)
20
+
21
+ checkpoint = torch.load(pjoin(opt.checkpoints_dir, opt.dataset_name, 'text_mot_match', 'model', 'finest.tar'),
22
+ map_location=opt.device)
23
+ movement_enc.load_state_dict(checkpoint['movement_encoder'])
24
+ text_enc.load_state_dict(checkpoint['text_encoder'])
25
+ motion_enc.load_state_dict(checkpoint['motion_encoder'])
26
+ print('Loading Evaluation Model Wrapper (Epoch %d) Completed!!' % (checkpoint['epoch']))
27
+ return text_enc, motion_enc, movement_enc
28
+
29
+
30
+ class EvaluatorModelWrapper(object):
31
+
32
+ def __init__(self, opt):
33
+
34
+ if opt.dataset_name == 't2m':
35
+ opt.dim_pose = 263
36
+ elif opt.dataset_name == 'kit':
37
+ opt.dim_pose = 251
38
+ else:
39
+ raise KeyError('Dataset not Recognized!!!')
40
+
41
+ opt.dim_word = 300
42
+ opt.max_motion_length = 196
43
+ opt.dim_pos_ohot = len(POS_enumerator)
44
+ opt.dim_motion_hidden = 1024
45
+ opt.max_text_len = 20
46
+ opt.dim_text_hidden = 512
47
+ opt.dim_coemb_hidden = 512
48
+
49
+ # print(opt)
50
+
51
+ self.text_encoder, self.motion_encoder, self.movement_encoder = build_models(opt)
52
+ self.opt = opt
53
+ self.device = opt.device
54
+
55
+ self.text_encoder.to(opt.device)
56
+ self.motion_encoder.to(opt.device)
57
+ self.movement_encoder.to(opt.device)
58
+
59
+ self.text_encoder.eval()
60
+ self.motion_encoder.eval()
61
+ self.movement_encoder.eval()
62
+
63
+ # Please note that the results do not follow the order of the inputs
64
+ def get_co_embeddings(self, word_embs, pos_ohot, cap_lens, motions, m_lens):
65
+ with torch.no_grad():
66
+ word_embs = word_embs.detach().to(self.device).float()
67
+ pos_ohot = pos_ohot.detach().to(self.device).float()
68
+ motions = motions.detach().to(self.device).float()
69
+
70
+ '''Movement Encoding'''
71
+ movements = self.movement_encoder(motions[..., :-4]).detach()
72
+ m_lens = m_lens // self.opt.unit_length
73
+ motion_embedding = self.motion_encoder(movements, m_lens)
74
+
75
+ '''Text Encoding'''
76
+ text_embedding = self.text_encoder(word_embs, pos_ohot, cap_lens)
77
+ return text_embedding, motion_embedding
78
+
79
+ # Please note that the results do not follow the order of the inputs
80
+ def get_motion_embeddings(self, motions, m_lens):
81
+ with torch.no_grad():
82
+ motions = motions.detach().to(self.device).float()
83
+
84
+ align_idx = np.argsort(m_lens.data.tolist())[::-1].copy()
85
+ motions = motions[align_idx]
86
+ m_lens = m_lens[align_idx]
87
+
88
+ '''Movement Encoding'''
89
+ movements = self.movement_encoder(motions[..., :-4]).detach()
90
+ m_lens = m_lens // self.opt.unit_length
91
+ motion_embedding = self.motion_encoder(movements, m_lens)
92
+ return motion_embedding
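The two embeddings returned by get_co_embeddings live in a shared space, so text-motion agreement can be scored by Euclidean distance. A hedged sketch of an R-precision-style check, assuming a constructed wrapper and a batch from the evaluation loader:

    # Hedged sketch: ranks motions against their captions with the co-embeddings.
    import torch

    def r_precision_at_1(text_emb, motion_emb):
        # pairwise Euclidean distances between every caption and every motion in the batch
        dist = torch.cdist(text_emb, motion_emb)                       # (B, B)
        top1 = dist.argmin(dim=1)                                      # closest motion per caption
        labels = torch.arange(len(text_emb), device=text_emb.device)
        return (top1 == labels).float().mean()

    # text_emb, motion_emb = eval_wrapper.get_co_embeddings(word_embs, pos_ohot, cap_lens, motions, m_lens)
    # print('R@1:', r_precision_at_1(text_emb, motion_emb).item())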
models/modules.py ADDED
@@ -0,0 +1,109 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn.utils.rnn import pack_padded_sequence
4
+
5
+ def init_weight(m):
6
+ if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear) or isinstance(m, nn.ConvTranspose1d):
7
+ nn.init.xavier_normal_(m.weight)
8
+ # m.bias.data.fill_(0.01)
9
+ if m.bias is not None:
10
+ nn.init.constant_(m.bias, 0)
11
+
12
+
13
+ class MovementConvEncoder(nn.Module):
14
+ def __init__(self, input_size, hidden_size, output_size):
15
+ super(MovementConvEncoder, self).__init__()
16
+ self.main = nn.Sequential(
17
+ nn.Conv1d(input_size, hidden_size, 4, 2, 1),
18
+ nn.Dropout(0.2, inplace=True),
19
+ nn.LeakyReLU(0.2, inplace=True),
20
+ nn.Conv1d(hidden_size, output_size, 4, 2, 1),
21
+ nn.Dropout(0.2, inplace=True),
22
+ nn.LeakyReLU(0.2, inplace=True),
23
+ )
24
+ self.out_net = nn.Linear(output_size, output_size)
25
+ self.main.apply(init_weight)
26
+ self.out_net.apply(init_weight)
27
+
28
+ def forward(self, inputs):
29
+ inputs = inputs.permute(0, 2, 1)
30
+ outputs = self.main(inputs).permute(0, 2, 1)
31
+ # print(outputs.shape)
32
+ return self.out_net(outputs)
33
+
34
+
35
+
36
+ class TextEncoderBiGRUCo(nn.Module):
37
+ def __init__(self, word_size, pos_size, hidden_size, output_size, device):
38
+ super(TextEncoderBiGRUCo, self).__init__()
39
+ self.device = device
40
+
41
+ self.pos_emb = nn.Linear(pos_size, word_size)
42
+ self.input_emb = nn.Linear(word_size, hidden_size)
43
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
44
+ self.output_net = nn.Sequential(
45
+ nn.Linear(hidden_size * 2, hidden_size),
46
+ nn.LayerNorm(hidden_size),
47
+ nn.LeakyReLU(0.2, inplace=True),
48
+ nn.Linear(hidden_size, output_size)
49
+ )
50
+
51
+ self.input_emb.apply(init_weight)
52
+ self.pos_emb.apply(init_weight)
53
+ self.output_net.apply(init_weight)
54
+ self.hidden_size = hidden_size
55
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
56
+
57
+ # input(batch_size, seq_len, dim)
58
+ def forward(self, word_embs, pos_onehot, cap_lens):
59
+ num_samples = word_embs.shape[0]
60
+
61
+ pos_embs = self.pos_emb(pos_onehot)
62
+ inputs = word_embs + pos_embs
63
+ input_embs = self.input_emb(inputs)
64
+ hidden = self.hidden.repeat(1, num_samples, 1)
65
+
66
+ cap_lens = cap_lens.data.tolist()
67
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
68
+
69
+ gru_seq, gru_last = self.gru(emb, hidden)
70
+
71
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
72
+
73
+ return self.output_net(gru_last)
74
+
75
+
76
+ class MotionEncoderBiGRUCo(nn.Module):
77
+ def __init__(self, input_size, hidden_size, output_size, device):
78
+ super(MotionEncoderBiGRUCo, self).__init__()
79
+ self.device = device
80
+
81
+ self.input_emb = nn.Linear(input_size, hidden_size)
82
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
83
+ self.output_net = nn.Sequential(
84
+ nn.Linear(hidden_size*2, hidden_size),
85
+ nn.LayerNorm(hidden_size),
86
+ nn.LeakyReLU(0.2, inplace=True),
87
+ nn.Linear(hidden_size, output_size)
88
+ )
89
+
90
+ self.input_emb.apply(init_weight)
91
+ self.output_net.apply(init_weight)
92
+ self.hidden_size = hidden_size
93
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
94
+
95
+ # input(batch_size, seq_len, dim)
96
+ def forward(self, inputs, m_lens):
97
+ num_samples = inputs.shape[0]
98
+
99
+ input_embs = self.input_emb(inputs)
100
+ hidden = self.hidden.repeat(1, num_samples, 1)
101
+
102
+ cap_lens = m_lens.data.tolist()
103
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True, enforce_sorted=False)
104
+
105
+ gru_seq, gru_last = self.gru(emb, hidden)
106
+
107
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
108
+
109
+ return self.output_net(gru_last)
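A dummy forward pass through the movement and motion encoders above. The 263-dim pose, 1024 motion hidden size and 512 co-embedding size follow the t2m settings in EvaluatorModelWrapper; the movement-encoder hidden sizes and the unit_length of 4 are assumptions made for illustration:

    # Shape sketch with dummy tensors; hidden sizes are illustrative, not the trained checkpoint's.
    import torch
    from models.modules import MovementConvEncoder, MotionEncoderBiGRUCo

    mov_enc = MovementConvEncoder(input_size=263 - 4, hidden_size=512, output_size=512)
    mot_enc = MotionEncoderBiGRUCo(input_size=512, hidden_size=1024, output_size=512, device='cpu')

    motions = torch.randn(2, 196, 263)       # (batch, frames, pose_dim)
    movements = mov_enc(motions[..., :-4])   # two stride-2 convs -> (2, 49, 512)
    m_lens = torch.tensor([49, 40])          # frame lengths // assumed unit_length of 4
    emb = mot_enc(movements, m_lens)
    print(movements.shape, emb.shape)        # torch.Size([2, 49, 512]) torch.Size([2, 512])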
models/pos_encoding.py ADDED
@@ -0,0 +1,43 @@
1
+ """
2
+ Various positional encodings for the transformer.
3
+ """
4
+ import math
5
+ import torch
6
+ from torch import nn
7
+
8
+ def PE1d_sincos(seq_length, dim):
9
+ """
10
+ :param d_model: dimension of the model
11
+ :param length: length of positions
12
+ :return: length*d_model position matrix
13
+ """
14
+ if dim % 2 != 0:
15
+ raise ValueError("Cannot use sin/cos positional encoding with "
16
+ "odd dim (got dim={:d})".format(dim))
17
+ pe = torch.zeros(seq_length, dim)
18
+ position = torch.arange(0, seq_length).unsqueeze(1)
19
+ div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) *
20
+ -(math.log(10000.0) / dim)))
21
+ pe[:, 0::2] = torch.sin(position.float() * div_term)
22
+ pe[:, 1::2] = torch.cos(position.float() * div_term)
23
+
24
+ return pe.unsqueeze(1)
25
+
26
+
27
+ class PositionEmbedding(nn.Module):
28
+ """
29
+ Absolute pos embedding (standard), learned.
30
+ """
31
+ def __init__(self, seq_length, dim, dropout, grad=False):
32
+ super().__init__()
33
+ self.embed = nn.Parameter(data=PE1d_sincos(seq_length, dim), requires_grad=grad)
34
+ self.dropout = nn.Dropout(p=dropout)
35
+
36
+ def forward(self, x):
37
+ # x.shape: bs, seq_len, feat_dim
38
+ l = x.shape[1]
39
+ x = x.permute(1, 0, 2) + self.embed[:l].expand(x.permute(1, 0, 2).shape)
40
+ x = self.dropout(x.permute(1, 0, 2))
41
+ return x
42
+
43
+
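A minimal check of the sinusoidal table and the PositionEmbedding wrapper above; the sizes are arbitrary:

    # PE1d_sincos builds a (seq_length, 1, dim) table; the wrapper adds it to (batch, seq, dim) inputs.
    import torch
    from models.pos_encoding import PE1d_sincos, PositionEmbedding

    pe = PE1d_sincos(seq_length=51, dim=512)
    print(pe.shape)                      # torch.Size([51, 1, 512])

    emb = PositionEmbedding(seq_length=51, dim=512, dropout=0.1, grad=False)
    x = torch.randn(2, 16, 512)          # (batch, seq_len, feat_dim)
    print(emb(x).shape)                  # torch.Size([2, 16, 512]) - positions added, shape preserved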
models/quantize_cnn.py ADDED
@@ -0,0 +1,423 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ class QuantizeEMAReset(nn.Module):
7
+ def __init__(self, nb_code, code_dim, args):
8
+ super().__init__()
9
+ self.nb_code = nb_code
10
+ self.code_dim = code_dim
11
+ self.mu = args.mu
12
+ self.reset_codebook()
13
+ self.reset_count = 0
14
+ self.usage = torch.zeros((self.nb_code, 1))
15
+
16
+ def reset_codebook(self):
17
+ self.init = False
18
+ self.code_sum = None
19
+ self.code_count = None
20
+ self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim))
21
+
22
+ def _tile(self, x):
23
+ nb_code_x, code_dim = x.shape
24
+ if nb_code_x < self.nb_code:
25
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
26
+ std = 0.01 / np.sqrt(code_dim)
27
+ out = x.repeat(n_repeats, 1)
28
+ out = out + torch.randn_like(out) * std
29
+ else :
30
+ out = x
31
+ return out
32
+
33
+ def init_codebook(self, x):
34
+ out = self._tile(x)
35
+ self.codebook = out[:self.nb_code]
36
+ self.code_sum = self.codebook.clone()
37
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
38
+ self.init = True
39
+
40
+ @torch.no_grad()
41
+ def compute_perplexity(self, code_idx) :
42
+ # Calculate new centres
43
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
44
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
45
+
46
+ code_count = code_onehot.sum(dim=-1) # nb_code
47
+ prob = code_count / torch.sum(code_count)
48
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
49
+ return perplexity
50
+
51
+ @torch.no_grad()
52
+ def update_codebook(self, x, code_idx):
53
+
54
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
55
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
56
+
57
+ code_sum = torch.matmul(code_onehot, x) # nb_code, w
58
+ code_count = code_onehot.sum(dim=-1) # nb_code
59
+
60
+ out = self._tile(x)
61
+ code_rand = out[torch.randperm(out.shape[0])[:self.nb_code]]
62
+
63
+ # Update centres
64
+ self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum # w, nb_code
65
+ self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count # nb_code
66
+
67
+ usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
68
+ self.usage = self.usage.to(usage.device)
69
+ if self.reset_count >= 20:
70
+ self.reset_count = 0
71
+ usage = (usage + self.usage >= 1.0).float()
72
+ else:
73
+ self.reset_count += 1
74
+ self.usage = (usage + self.usage >= 1.0).float()
75
+ usage = torch.ones_like(self.usage, device=x.device)
76
+ code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)
77
+
78
+ self.codebook = usage * code_update + (1 - usage) * code_rand
79
+ prob = code_count / torch.sum(code_count)
80
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
81
+
82
+
83
+ return perplexity
84
+
85
+ def preprocess(self, x):
86
+ # NCT -> NTC -> [NT, C]
87
+ x = x.permute(0, 2, 1).contiguous()
88
+ x = x.view(-1, x.shape[-1])
89
+ return x
90
+
91
+ def quantize(self, x):
92
+ # Calculate latent code x_l
93
+ k_w = self.codebook.t()
94
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
95
+ keepdim=True) # (N * L, b)
96
+ _, code_idx = torch.min(distance, dim=-1)
97
+ return code_idx
98
+
99
+ def dequantize(self, code_idx):
100
+ x = F.embedding(code_idx, self.codebook)
101
+ return x
102
+
103
+
104
+ def forward(self, x):
105
+ N, width, T = x.shape
106
+
107
+ # Preprocess
108
+ x = self.preprocess(x)
109
+
110
+ # Init codebook if not inited
111
+ if self.training and not self.init:
112
+ self.init_codebook(x)
113
+
114
+ # quantize and dequantize through bottleneck
115
+ code_idx = self.quantize(x)
116
+ x_d = self.dequantize(code_idx)
117
+
118
+ # Update embeddings
119
+ if self.training:
120
+ perplexity = self.update_codebook(x, code_idx)
121
+ else :
122
+ perplexity = self.compute_perplexity(code_idx)
123
+
124
+ # Loss
125
+ commit_loss = F.mse_loss(x, x_d.detach())
126
+
127
+ # Passthrough
128
+ x_d = x + (x_d - x).detach()
129
+
130
+ # Postprocess
131
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
132
+
133
+ return x_d, commit_loss, perplexity
134
+
135
+
136
+
137
+ class Quantizer(nn.Module):
138
+ def __init__(self, n_e, e_dim, beta):
139
+ super(Quantizer, self).__init__()
140
+
141
+ self.e_dim = e_dim
142
+ self.n_e = n_e
143
+ self.beta = beta
144
+
145
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
146
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
147
+
148
+ def forward(self, z):
149
+
150
+ N, width, T = z.shape
151
+ z = self.preprocess(z)
152
+ assert z.shape[-1] == self.e_dim
153
+ z_flattened = z.contiguous().view(-1, self.e_dim)
154
+
155
+ # B x V
156
+ d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
157
+ torch.sum(self.embedding.weight**2, dim=1) - 2 * \
158
+ torch.matmul(z_flattened, self.embedding.weight.t())
159
+ # B x 1
160
+ min_encoding_indices = torch.argmin(d, dim=1)
161
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
162
+
163
+ # compute loss for embedding
164
+ loss = torch.mean((z_q - z.detach())**2) + self.beta * \
165
+ torch.mean((z_q.detach() - z)**2)
166
+
167
+ # preserve gradients
168
+ z_q = z + (z_q - z).detach()
169
+ z_q = z_q.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
170
+
171
+ min_encodings = F.one_hot(min_encoding_indices, self.n_e).type(z.dtype)
172
+ e_mean = torch.mean(min_encodings, dim=0)
173
+ perplexity = torch.exp(-torch.sum(e_mean*torch.log(e_mean + 1e-10)))
174
+ return z_q, loss, perplexity
175
+
176
+ def quantize(self, z):
177
+
178
+ assert z.shape[-1] == self.e_dim
179
+
180
+ # B x V
181
+ d = torch.sum(z ** 2, dim=1, keepdim=True) + \
182
+ torch.sum(self.embedding.weight ** 2, dim=1) - 2 * \
183
+ torch.matmul(z, self.embedding.weight.t())
184
+ # B x 1
185
+ min_encoding_indices = torch.argmin(d, dim=1)
186
+ return min_encoding_indices
187
+
188
+ def dequantize(self, indices):
189
+
190
+ index_flattened = indices.view(-1)
191
+ z_q = self.embedding(index_flattened)
192
+ z_q = z_q.view(indices.shape + (self.e_dim, )).contiguous()
193
+ return z_q
194
+
195
+ def preprocess(self, x):
196
+ # NCT -> NTC -> [NT, C]
197
+ x = x.permute(0, 2, 1).contiguous()
198
+ x = x.view(-1, x.shape[-1])
199
+ return x
200
+
201
+
202
+
203
+ class QuantizeReset(nn.Module):
204
+ def __init__(self, nb_code, code_dim, args):
205
+ super().__init__()
206
+ self.nb_code = nb_code
207
+ self.code_dim = code_dim
208
+ self.reset_codebook()
209
+ self.codebook = nn.Parameter(torch.randn(nb_code, code_dim))
210
+
211
+ def reset_codebook(self):
212
+ self.init = False
213
+ self.code_count = None
214
+
215
+ def _tile(self, x):
216
+ nb_code_x, code_dim = x.shape
217
+ if nb_code_x < self.nb_code:
218
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
219
+ std = 0.01 / np.sqrt(code_dim)
220
+ out = x.repeat(n_repeats, 1)
221
+ out = out + torch.randn_like(out) * std
222
+ else :
223
+ out = x
224
+ return out
225
+
226
+ def init_codebook(self, x):
227
+ out = self._tile(x)
228
+ self.codebook = nn.Parameter(out[:self.nb_code])
229
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
230
+ self.init = True
231
+
232
+ @torch.no_grad()
233
+ def compute_perplexity(self, code_idx) :
234
+ # Calculate new centres
235
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
236
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
237
+
238
+ code_count = code_onehot.sum(dim=-1) # nb_code
239
+ prob = code_count / torch.sum(code_count)
240
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
241
+ return perplexity
242
+
243
+ def update_codebook(self, x, code_idx):
244
+
245
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
246
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
247
+
248
+ code_count = code_onehot.sum(dim=-1) # nb_code
249
+
250
+ out = self._tile(x)
251
+ code_rand = out[:self.nb_code]
252
+
253
+ # Update centres
254
+ self.code_count = code_count # nb_code
255
+ usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
256
+
257
+ self.codebook.data = usage * self.codebook.data + (1 - usage) * code_rand
258
+ prob = code_count / torch.sum(code_count)
259
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
260
+
261
+
262
+ return perplexity
263
+
264
+ def preprocess(self, x):
265
+ # NCT -> NTC -> [NT, C]
266
+ x = x.permute(0, 2, 1).contiguous()
267
+ x = x.view(-1, x.shape[-1])
268
+ return x
269
+
270
+ def quantize(self, x):
271
+ # Calculate latent code x_l
272
+ k_w = self.codebook.t()
273
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
274
+ keepdim=True) # (N * L, b)
275
+ _, code_idx = torch.min(distance, dim=-1)
276
+ return code_idx
277
+
278
+ def dequantize(self, code_idx):
279
+ x = F.embedding(code_idx, self.codebook)
280
+ return x
281
+
282
+
283
+ def forward(self, x):
284
+ N, width, T = x.shape
285
+ # Preprocess
286
+ x = self.preprocess(x)
287
+ # Init codebook if not inited
288
+ if self.training and not self.init:
289
+ self.init_codebook(x)
290
+ # quantize and dequantize through bottleneck
291
+ code_idx = self.quantize(x)
292
+ x_d = self.dequantize(code_idx)
293
+ # Update embeddings
294
+ if self.training:
295
+ perplexity = self.update_codebook(x, code_idx)
296
+ else :
297
+ perplexity = self.compute_perplexity(code_idx)
298
+
299
+ # Loss
300
+ commit_loss = F.mse_loss(x, x_d.detach())
301
+
302
+ # Passthrough
303
+ x_d = x + (x_d - x).detach()
304
+
305
+ # Postprocess
306
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
307
+
308
+ return x_d, commit_loss, perplexity
309
+
310
+
311
+ class QuantizeEMA(nn.Module):
312
+ def __init__(self, nb_code, code_dim, args):
313
+ super().__init__()
314
+ self.nb_code = nb_code
315
+ self.code_dim = code_dim
316
+ self.mu = 0.99
317
+ self.reset_codebook()
318
+
319
+ def reset_codebook(self):
320
+ self.init = False
321
+ self.code_sum = None
322
+ self.code_count = None
323
+ self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim).cuda())
324
+
325
+ def _tile(self, x):
326
+ nb_code_x, code_dim = x.shape
327
+ if nb_code_x < self.nb_code:
328
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
329
+ std = 0.01 / np.sqrt(code_dim)
330
+ out = x.repeat(n_repeats, 1)
331
+ out = out + torch.randn_like(out) * std
332
+ else :
333
+ out = x
334
+ return out
335
+
336
+ def init_codebook(self, x):
337
+ out = self._tile(x)
338
+ self.codebook = out[:self.nb_code]
339
+ self.code_sum = self.codebook.clone()
340
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
341
+ self.init = True
342
+
343
+ @torch.no_grad()
344
+ def compute_perplexity(self, code_idx) :
345
+ # Calculate new centres
346
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
347
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
348
+
349
+ code_count = code_onehot.sum(dim=-1) # nb_code
350
+ prob = code_count / torch.sum(code_count)
351
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
352
+ return perplexity
353
+
354
+ @torch.no_grad()
355
+ def update_codebook(self, x, code_idx):
356
+
357
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
358
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
359
+
360
+ code_sum = torch.matmul(code_onehot, x) # nb_code, w
361
+ code_count = code_onehot.sum(dim=-1) # nb_code
362
+
363
+ # Update centres
364
+ self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum # w, nb_code
365
+ self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count # nb_code
366
+
367
+ code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)
368
+
369
+ self.codebook = code_update
370
+ prob = code_count / torch.sum(code_count)
371
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
372
+
373
+ return perplexity
374
+
375
+ def preprocess(self, x):
376
+ # NCT -> NTC -> [NT, C]
377
+ x = x.permute(0, 2, 1).contiguous()
378
+ x = x.view(-1, x.shape[-1])
379
+ return x
380
+
381
+ def quantize(self, x):
382
+ # Calculate latent code x_l
383
+ k_w = self.codebook.t()
384
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
385
+ keepdim=True) # (N * L, b)
386
+ _, code_idx = torch.min(distance, dim=-1)
387
+ return code_idx
388
+
389
+ def dequantize(self, code_idx):
390
+ x = F.embedding(code_idx, self.codebook)
391
+ return x
392
+
393
+
394
+ def forward(self, x):
395
+ N, width, T = x.shape
396
+
397
+ # Preprocess
398
+ x = self.preprocess(x)
399
+
400
+ # Init codebook if not inited
401
+ if self.training and not self.init:
402
+ self.init_codebook(x)
403
+
404
+ # quantize and dequantize through bottleneck
405
+ code_idx = self.quantize(x)
406
+ x_d = self.dequantize(code_idx)
407
+
408
+ # Update embeddings
409
+ if self.training:
410
+ perplexity = self.update_codebook(x, code_idx)
411
+ else :
412
+ perplexity = self.compute_perplexity(code_idx)
413
+
414
+ # Loss
415
+ commit_loss = F.mse_loss(x, x_d.detach())
416
+
417
+ # Passthrough
418
+ x_d = x + (x_d - x).detach()
419
+
420
+ # Postprocess
421
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
422
+
423
+ return x_d, commit_loss, perplexity
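A straight-through round trip through QuantizeEMAReset with dummy data; the args object is a stand-in namespace supplying only the EMA decay mu that the class reads, and the sizes are illustrative:

    # Minimal VQ round trip on CPU.
    import torch
    from types import SimpleNamespace
    from models.quantize_cnn import QuantizeEMAReset

    quant = QuantizeEMAReset(nb_code=512, code_dim=32, args=SimpleNamespace(mu=0.99))
    quant.train()                                   # training mode initialises and EMA-updates the codebook

    x = torch.randn(4, 32, 16)                      # (N, code_dim, T) encoder output
    x_d, commit_loss, perplexity = quant(x)
    print(x_d.shape)                                # torch.Size([4, 32, 16]), same as the input
    print(commit_loss.item(), perplexity.item())    # commitment loss and codebook perplexity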
models/resnet.py ADDED
@@ -0,0 +1,82 @@
1
+ import torch.nn as nn
2
+ import torch
3
+
4
+ class nonlinearity(nn.Module):
5
+ def __init__(self):
6
+ super().__init__()
7
+
8
+ def forward(self, x):
9
+ # swish
10
+ return x * torch.sigmoid(x)
11
+
12
+ class ResConv1DBlock(nn.Module):
13
+ def __init__(self, n_in, n_state, dilation=1, activation='silu', norm=None, dropout=None):
14
+ super().__init__()
15
+ padding = dilation
16
+ self.norm = norm
17
+ if norm == "LN":
18
+ self.norm1 = nn.LayerNorm(n_in)
19
+ self.norm2 = nn.LayerNorm(n_in)
20
+ elif norm == "GN":
21
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
22
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
23
+ elif norm == "BN":
24
+ self.norm1 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
25
+ self.norm2 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
26
+
27
+ else:
28
+ self.norm1 = nn.Identity()
29
+ self.norm2 = nn.Identity()
30
+
31
+ if activation == "relu":
32
+ self.activation1 = nn.ReLU()
33
+ self.activation2 = nn.ReLU()
34
+
35
+ elif activation == "silu":
36
+ self.activation1 = nonlinearity()
37
+ self.activation2 = nonlinearity()
38
+
39
+ elif activation == "gelu":
40
+ self.activation1 = nn.GELU()
41
+ self.activation2 = nn.GELU()
42
+
43
+
44
+
45
+ self.conv1 = nn.Conv1d(n_in, n_state, 3, 1, padding, dilation)
46
+ self.conv2 = nn.Conv1d(n_state, n_in, 1, 1, 0,)
47
+
48
+
49
+ def forward(self, x):
50
+ x_orig = x
51
+ if self.norm == "LN":
52
+ x = self.norm1(x.transpose(-2, -1))
53
+ x = self.activation1(x.transpose(-2, -1))
54
+ else:
55
+ x = self.norm1(x)
56
+ x = self.activation1(x)
57
+
58
+ x = self.conv1(x)
59
+
60
+ if self.norm == "LN":
61
+ x = self.norm2(x.transpose(-2, -1))
62
+ x = self.activation2(x.transpose(-2, -1))
63
+ else:
64
+ x = self.norm2(x)
65
+ x = self.activation2(x)
66
+
67
+ x = self.conv2(x)
68
+ x = x + x_orig
69
+ return x
70
+
71
+ class Resnet1D(nn.Module):
72
+ def __init__(self, n_in, n_depth, dilation_growth_rate=1, reverse_dilation=True, activation='relu', norm=None):
73
+ super().__init__()
74
+
75
+ blocks = [ResConv1DBlock(n_in, n_in, dilation=dilation_growth_rate ** depth, activation=activation, norm=norm) for depth in range(n_depth)]
76
+ if reverse_dilation:
77
+ blocks = blocks[::-1]
78
+
79
+ self.model = nn.Sequential(*blocks)
80
+
81
+ def forward(self, x):
82
+ return self.model(x)
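The residual stack above preserves both the channel count and the temporal length, since every ResConv1DBlock pads by its dilation; only the receptive field grows with dilation_growth_rate ** depth. A tiny shape check:

    # Shape-preserving dilated residual stack.
    import torch
    from models.resnet import Resnet1D

    net = Resnet1D(n_in=512, n_depth=3, dilation_growth_rate=3, activation='relu', norm=None)
    x = torch.randn(2, 512, 16)     # (batch, channels, frames)
    print(net(x).shape)             # torch.Size([2, 512, 16])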
models/t2m_trans.py ADDED
@@ -0,0 +1,626 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import functional as F
5
+ from torch.distributions import Categorical
6
+ import models.pos_encoding as pos_encoding
7
+ from exit.utils import cosine_schedule, uniform, top_k, gumbel_sample, top_p
8
+ from tqdm import tqdm
9
+ from einops import rearrange, repeat
10
+ from exit.utils import get_model, generate_src_mask
11
+
12
+ class PatchUpSampling(nn.Module):
13
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
14
+ super().__init__()
15
+ self.dim = dim
16
+ self.up_sampling = nn.Linear(dim, 4 * dim, bias=False)
17
+ self.norm = norm_layer(dim)
18
+
19
+ def forward(self, x):
20
+ """
21
+ x: B, F, C
22
+ """
23
+ x = self.norm(x)
24
+ x = self.up_sampling(x)
25
+ x0 = x[:, :, 0::4]
26
+ x1 = x[:, :, 1::4]
27
+ x2 = x[:, :, 2::4]
28
+ x3 = x[:, :, 3::4]
29
+ x = torch.cat([x0, x1, x2, x3], 1)
30
+ return x
31
+
32
+ class Decoder_Transformer(nn.Module):
33
+ def __init__(self,
34
+ code_dim=1024,
35
+ embed_dim=512,
36
+ output_dim=263,
37
+ block_size=16,
38
+ num_layers=2,
39
+ n_head=8,
40
+ drop_out_rate=0.1,
41
+ fc_rate=4):
42
+
43
+ super().__init__()
44
+ self.joint_embed = nn.Linear(code_dim, embed_dim)
45
+ self.drop = nn.Dropout(drop_out_rate)
46
+ # transformer block
47
+ self.blocks = nn.Sequential(*[Block(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_layers)])
48
+ self.up_sample = PatchUpSampling(embed_dim)
49
+ self.pos_embed = pos_encoding.PositionEmbedding(block_size, embed_dim, 0.0, False)
50
+ self.head = nn.Sequential(nn.LayerNorm(embed_dim),
51
+ nn.Linear(embed_dim, output_dim))
52
+ self.block_size = block_size
53
+ self.n_head = n_head
54
+ self.apply(self._init_weights)
55
+
56
+ def get_block_size(self):
57
+ return self.block_size
58
+
59
+ def _init_weights(self, module):
60
+ if isinstance(module, (nn.Linear, nn.Embedding)):
61
+ module.weight.data.normal_(mean=0.0, std=0.02)
62
+ if isinstance(module, nn.Linear) and module.bias is not None:
63
+ module.bias.data.zero_()
64
+ elif isinstance(module, nn.LayerNorm):
65
+ module.bias.data.zero_()
66
+ module.weight.data.fill_(1.0)
67
+
68
+ def forward(self, token_embeddings):
69
+ # token_embeddings = self.tok_emb(idx)
70
+ # B, T = src_mask.shape
71
+ # src_mask = src_mask.view(B, 1, 1, T).repeat(1, self.n_head, T, 1)
72
+
73
+ token_embeddings = token_embeddings.permute(0, 2, 1)
74
+ token_embeddings = self.joint_embed(token_embeddings)
75
+ x = self.pos_embed(token_embeddings)
76
+
77
+
78
+ for block in self.blocks:
79
+ x = block(x)
80
+ x = self.up_sample(x)
81
+
82
+
83
+ x = self.head(x).permute(0, 2, 1)
84
+ return x
85
+
86
+ # https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py#L342C9-L343C33
87
+ class PatchMerging(nn.Module):
88
+ def __init__(self, input_feats, dim, norm_layer=nn.LayerNorm):
89
+ super().__init__()
90
+ self.dim = dim
91
+ self.reduction = nn.Linear(4 * input_feats, dim, bias=False)
92
+ self.norm = norm_layer(4 * input_feats)
93
+
94
+ def forward(self, x):
95
+ """
96
+ x: B, F, C
97
+ """
98
+ x0 = x[:, 0::4, :] # B F/2 C
99
+ x1 = x[:, 1::4, :]
100
+ x2 = x[:, 2::4, :] # B F/2 C
101
+ x3 = x[:, 3::4, :]
102
+ x = torch.cat([x0, x1, x2, x3], -1) # B F/2 2*C
103
+ x = self.norm(x)
104
+ x = self.reduction(x)
105
+ return x
106
+
107
+ class Encoder_Transformer(nn.Module):
108
+ def __init__(self,
109
+ input_feats=1024,
110
+ embed_dim=512,
111
+ output_dim=263,
112
+ block_size=16,
113
+ num_layers=2,
114
+ n_head=8,
115
+ drop_out_rate=0.1,
116
+ fc_rate=4):
117
+
118
+ super().__init__()
119
+ self.joint_embed = nn.Linear(input_feats, embed_dim)
120
+ self.drop = nn.Dropout(drop_out_rate)
121
+ # transformer block
122
+ self.blocks = nn.Sequential(*[Block(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_layers)])
123
+ self.weighted_mean_norm = nn.LayerNorm(embed_dim)
124
+ self.weighted_mean = torch.nn.Conv1d(in_channels=block_size, out_channels=1, kernel_size=1)
125
+
126
+ self.pos_embed = pos_encoding.PositionEmbedding(block_size, embed_dim, 0.0, False)
127
+ self.head = nn.Sequential(nn.LayerNorm(embed_dim),
128
+ nn.Linear(embed_dim, output_dim))
129
+ self.block_size = block_size
130
+ self.n_head = n_head
131
+ self.apply(self._init_weights)
132
+
133
+ def get_block_size(self):
134
+ return self.block_size
135
+
136
+ def _init_weights(self, module):
137
+ if isinstance(module, (nn.Linear, nn.Embedding)):
138
+ module.weight.data.normal_(mean=0.0, std=0.02)
139
+ if isinstance(module, nn.Linear) and module.bias is not None:
140
+ module.bias.data.zero_()
141
+ elif isinstance(module, nn.LayerNorm):
142
+ module.bias.data.zero_()
143
+ module.weight.data.fill_(1.0)
144
+
145
+ def forward(self, joints):
146
+ # B, T = src_mask.shape
147
+
148
+ joints = joints.permute(0,2,1)
149
+ # token_embeddings = self.joint_embed(joints)
150
+
151
+ block_step_len = int(len(self.blocks)/3)
152
+
153
+ x = self.joint_embed(joints)
154
+ token_len = int(x.shape[1]/self.block_size)
155
+ _original_shape = list(x.shape)
156
+ x = x.view(x.shape[0]*token_len, self.block_size, -1)
157
+
158
+ x = self.pos_embed(x)
159
+ for block in self.blocks:
160
+ x = block(x)
161
+ x = self.weighted_mean_norm(x)
162
+ x = self.weighted_mean(x)
163
+ _original_shape[1] = int(_original_shape[1] / self.block_size)
164
+ x = x.view(*_original_shape)
165
+
166
+ x = self.head(x).permute(0, 2, 1)
167
+ return x
168
+
169
+ class Text2Motion_Transformer(nn.Module):
170
+
171
+ def __init__(self,
172
+ vqvae,
173
+ num_vq=1024,
174
+ embed_dim=512,
175
+ clip_dim=512,
176
+ block_size=16,
177
+ num_layers=2,
178
+ num_local_layer=0,
179
+ n_head=8,
180
+ drop_out_rate=0.1,
181
+ fc_rate=4):
182
+ super().__init__()
183
+ self.n_head = n_head
184
+ self.trans_base = CrossCondTransBase(vqvae, num_vq, embed_dim, clip_dim, block_size, num_layers, num_local_layer, n_head, drop_out_rate, fc_rate)
185
+ self.trans_head = CrossCondTransHead(num_vq, embed_dim, block_size, num_layers, n_head, drop_out_rate, fc_rate)
186
+ self.block_size = block_size
187
+ self.num_vq = num_vq
188
+
189
+ # self.skip_trans = Skip_Connection_Transformer(num_vq, embed_dim, clip_dim, block_size, num_layers, n_head, drop_out_rate, fc_rate)
190
+
191
+ def get_block_size(self):
192
+ return self.block_size
193
+
194
+ def forward(self, *args, type='forward', **kwargs):
195
+ '''type=[forward, sample]'''
196
+ if type=='forward':
197
+ return self.forward_function(*args, **kwargs)
198
+ elif type=='sample':
199
+ return self.sample(*args, **kwargs)
200
+ elif type=='inpaint':
201
+ return self.inpaint(*args, **kwargs)
202
+ else:
203
+ raise ValueError(f'Unknown "{type}" type')
204
+
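+ # Illustrative call patterns for the type-based dispatch above (here `model` denotes a
+ # Text2Motion_Transformer instance; variable names are assumptions, arguments follow the
+ # signatures defined below):
+ #   logits = model(idxs, clip_feature, src_mask, type='forward')
+ #   tokens = model(clip_feature, word_emb, m_length, type='sample')
+ #   tokens = model(first_tokens, last_tokens, clip_feature, word_emb, type='inpaint')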
205
+ def get_attn_mask(self, src_mask, att_txt=None):
206
+ if att_txt is None:
207
+ att_txt = torch.tensor([[True]]*src_mask.shape[0]).to(src_mask.device)
208
+ src_mask = torch.cat([att_txt, src_mask], dim=1)
209
+ B, T = src_mask.shape
210
+ src_mask = src_mask.view(B, 1, 1, T).repeat(1, self.n_head, T, 1)
211
+ return src_mask
212
+
213
+ def forward_function(self, idxs, clip_feature, src_mask=None, att_txt=None, word_emb=None):
214
+ if src_mask is not None:
215
+ src_mask = self.get_attn_mask(src_mask, att_txt)
216
+ feat = self.trans_base(idxs, clip_feature, src_mask, word_emb)
217
+ logits = self.trans_head(feat, src_mask)
218
+
219
+ return logits
220
+
221
+ def sample(self, clip_feature, word_emb, m_length=None, if_test=False, rand_pos=True, CFG=-1, token_cond=None, max_steps = 10):
222
+ max_length = 49
223
+ batch_size = clip_feature.shape[0]
224
+ mask_id = self.num_vq + 2
225
+ pad_id = self.num_vq + 1
226
+ end_id = self.num_vq
227
+ shape = (batch_size, self.block_size - 1)
228
+ topk_filter_thres = .9
229
+ starting_temperature = 1.0
230
+ scores = torch.ones(shape, dtype = torch.float32, device = clip_feature.device)
231
+
232
+ m_tokens_len = torch.ceil((m_length)/4).long()
233
+ src_token_mask = generate_src_mask(self.block_size-1, m_tokens_len+1)
234
+ src_token_mask_noend = generate_src_mask(self.block_size-1, m_tokens_len)
235
+ if token_cond is not None:
236
+ ids = token_cond.clone()
237
+ ids[~src_token_mask_noend] = pad_id
238
+ num_token_cond = (ids==mask_id).sum(-1)
239
+ else:
240
+ ids = torch.full(shape, mask_id, dtype = torch.long, device = clip_feature.device)
241
+
242
+ # [TODO] confirm that these 2 lines are not necessary (repeated below; they may not be needed at all)
243
+ ids[~src_token_mask] = pad_id # [INFO] replace with pad id
244
+ ids.scatter_(-1, m_tokens_len[..., None].long(), end_id) # [INFO] replace with end id
245
+
246
+ sample_max_steps = torch.round(max_steps/max_length*m_tokens_len) + 1e-8
247
+ for step in range(max_steps):
248
+ timestep = torch.clip(step/(sample_max_steps), max=1)
249
+ if len(m_tokens_len)==1 and step > 0 and torch.clip(step-1/(sample_max_steps), max=1).cpu().item() == timestep:
250
+ break
251
+ rand_mask_prob = cosine_schedule(timestep) # timestep #
252
+ num_token_masked = (rand_mask_prob * m_tokens_len).long().clip(min=1)
253
+
254
+ if token_cond is not None:
255
+ num_token_masked = (rand_mask_prob * num_token_cond).long().clip(min=1)
256
+ scores[token_cond!=mask_id] = 0
257
+
258
+ # [INFO] rm no motion frames
259
+ scores[~src_token_mask_noend] = 0
260
+ scores = scores/scores.sum(-1)[:, None] # normalize only unmasked token
261
+
262
+ # if rand_pos:
263
+ # sorted_score_indices = scores.multinomial(scores.shape[-1], replacement=False) # stochastic
264
+ # else:
265
+ sorted, sorted_score_indices = scores.sort(descending=True) # deterministic
266
+
267
+ ids[~src_token_mask] = pad_id # [INFO] replace with pad id
268
+ ids.scatter_(-1, m_tokens_len[..., None].long(), end_id) # [INFO] replace with end id
269
+ ## [INFO] Set the "num_token_masked" positions with the highest "scores" to "mask_id"
270
+ select_masked_indices = generate_src_mask(sorted_score_indices.shape[1], num_token_masked)
271
+ # [INFO] repeat last_index so the unused scatter_ slots simply re-mask the already-selected last position.
272
+ last_index = sorted_score_indices.gather(-1, num_token_masked.unsqueeze(-1)-1)
273
+ sorted_score_indices = sorted_score_indices * select_masked_indices + (last_index*~select_masked_indices)
274
+ ids.scatter_(-1, sorted_score_indices, mask_id)
275
+
276
+ logits = self.forward(ids, clip_feature, src_token_mask, word_emb=word_emb)[:,1:]
277
+ filtered_logits = logits #top_p(logits, .5) # #top_k(logits, topk_filter_thres)
278
+ if rand_pos:
279
+ temperature = 1 #starting_temperature * (steps_until_x0 / timesteps) # temperature is annealed
280
+ else:
281
+ temperature = 0 #starting_temperature * (steps_until_x0 / timesteps) # temperature is annealed
282
+
283
+ # [INFO] if temperature==0: is equal to argmax (filtered_logits.argmax(dim = -1))
284
+ # pred_ids = filtered_logits.argmax(dim = -1)
285
+ pred_ids = gumbel_sample(filtered_logits, temperature = temperature, dim = -1)
286
+ is_mask = ids == mask_id
287
+
288
+ ids = torch.where(
289
+ is_mask,
290
+ pred_ids,
291
+ ids
292
+ )
293
+
294
+ # if timestep == 1.:
295
+ # print(probs_without_temperature.shape)
296
+ probs_without_temperature = logits.softmax(dim = -1)
297
+ scores = 1 - probs_without_temperature.gather(-1, pred_ids[..., None])
298
+ scores = rearrange(scores, '... 1 -> ...')
299
+ scores = scores.masked_fill(~is_mask, 0)
300
+ if if_test:
301
+ return ids
302
+ return ids
303
+
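+ # [INFO] sample() above is MaskGIT-style parallel decoding: ids start fully masked (or from
+ # token_cond); each step re-masks a cosine-scheduled number of the least-confident positions
+ # (scores = 1 - p(predicted token)), re-predicts every masked position in parallel, and the
+ # effective number of steps scales with the motion length via sample_max_steps.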
304
+ def inpaint(self, first_tokens, last_tokens, clip_feature=None, word_emb=None, inpaint_len=2, rand_pos=False):
305
+ # support only one sample
306
+ assert first_tokens.shape[0] == 1
307
+ assert last_tokens.shape[0] == 1
308
+ max_steps = 20
309
+ max_length = 49
310
+ batch_size = first_tokens.shape[0]
311
+ mask_id = self.num_vq + 2
312
+ pad_id = self.num_vq + 1
313
+ end_id = self.num_vq
314
+ shape = (batch_size, self.block_size - 1)
315
+ scores = torch.ones(shape, dtype = torch.float32, device = first_tokens.device)
316
+
317
+ # force add first / last tokens
318
+ first_partition_pos_idx = first_tokens.shape[1]
319
+ second_partition_pos_idx = first_partition_pos_idx + inpaint_len
320
+ end_pos_idx = second_partition_pos_idx + last_tokens.shape[1]
321
+
322
+ m_tokens_len = torch.ones(batch_size, device = first_tokens.device)*end_pos_idx
323
+
324
+ src_token_mask = generate_src_mask(self.block_size-1, m_tokens_len+1)
325
+ src_token_mask_noend = generate_src_mask(self.block_size-1, m_tokens_len)
326
+ ids = torch.full(shape, mask_id, dtype = torch.long, device = first_tokens.device)
327
+
328
+ ids[:, :first_partition_pos_idx] = first_tokens
329
+ ids[:, second_partition_pos_idx:end_pos_idx] = last_tokens
330
+ src_token_mask_noend[:, :first_partition_pos_idx] = False
331
+ src_token_mask_noend[:, second_partition_pos_idx:end_pos_idx] = False
332
+
333
+ # [TODO] confirm that these 2 lines are not necessary (repeated below; they may not be needed at all)
334
+ ids[~src_token_mask] = pad_id # [INFO] replace with pad id
335
+ ids.scatter_(-1, m_tokens_len[..., None].long(), end_id) # [INFO] replace with end id
336
+
337
+ temp = []
338
+ sample_max_steps = torch.round(max_steps/max_length*m_tokens_len) + 1e-8
339
+
340
+ if clip_feature is None:
341
+ clip_feature = torch.zeros(1, 512).to(first_tokens.device)
342
+ att_txt = torch.zeros((batch_size,1), dtype=torch.bool, device = first_tokens.device)
343
+ else:
344
+ att_txt = torch.ones((batch_size,1), dtype=torch.bool, device = first_tokens.device)
345
+
346
+ for step in range(max_steps):
347
+ timestep = torch.clip(step/(sample_max_steps), max=1)
348
+ rand_mask_prob = cosine_schedule(timestep) # timestep #
349
+ num_token_masked = (rand_mask_prob * m_tokens_len).long().clip(min=1)
350
+ # [INFO] rm no motion frames
351
+ scores[~src_token_mask_noend] = 0
352
+ # [INFO] rm begin and end frames
353
+ scores[:, :first_partition_pos_idx] = 0
354
+ scores[:, second_partition_pos_idx:end_pos_idx] = 0
355
+ scores = scores/scores.sum(-1)[:, None] # normalize only unmasked token
356
+
357
+ sorted, sorted_score_indices = scores.sort(descending=True) # deterministic
358
+
359
+ ids[~src_token_mask] = pad_id # [INFO] replace with pad id
360
+ ids.scatter_(-1, m_tokens_len[..., None].long(), end_id) # [INFO] replace with end id
361
+ ## [INFO] Set the "num_token_masked" positions with the highest "scores" to "mask_id"
362
+ select_masked_indices = generate_src_mask(sorted_score_indices.shape[1], num_token_masked)
363
+ # [INFO] repeat last_index so the unused scatter_ slots simply re-mask the already-selected last position.
364
+ last_index = sorted_score_indices.gather(-1, num_token_masked.unsqueeze(-1)-1)
365
+ sorted_score_indices = sorted_score_indices * select_masked_indices + (last_index*~select_masked_indices)
366
+ ids.scatter_(-1, sorted_score_indices, mask_id)
367
+
368
+ # [TODO] force-replace begin/end tokens because the number of masked tokens can exceed the actual inpainting frames
369
+ ids[:, :first_partition_pos_idx] = first_tokens
370
+ ids[:, second_partition_pos_idx:end_pos_idx] = last_tokens
371
+
372
+ logits = self.forward(ids, clip_feature, src_token_mask, word_emb=word_emb)[:,1:]
373
+ filtered_logits = logits #top_k(logits, topk_filter_thres)
374
+ if rand_pos:
375
+ temperature = 1 #starting_temperature * (steps_until_x0 / timesteps) # temperature is annealed
376
+ else:
377
+ temperature = 0 #starting_temperature * (steps_until_x0 / timesteps) # temperature is annealed
378
+
379
+ # [INFO] if temperature==0: is equal to argmax (filtered_logits.argmax(dim = -1))
380
+ # pred_ids = filtered_logits.argmax(dim = -1)
381
+ pred_ids = gumbel_sample(filtered_logits, temperature = temperature, dim = -1)
382
+ is_mask = ids == mask_id
383
+ temp.append(is_mask[:1])
384
+
385
+ ids = torch.where(
386
+ is_mask,
387
+ pred_ids,
388
+ ids
389
+ )
390
+
391
+ probs_without_temperature = logits.softmax(dim = -1)
392
+ scores = 1 - probs_without_temperature.gather(-1, pred_ids[..., None])
393
+ scores = rearrange(scores, '... 1 -> ...')
394
+ scores = scores.masked_fill(~is_mask, 0)
395
+ return ids
396
+
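+ # [INFO] inpaint() above keeps first_tokens / last_tokens fixed and fills inpaint_len masked
+ # tokens in between, reusing the same iterative re-masking loop as sample(); if clip_feature
+ # is None, a zero text embedding is substituted.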
397
+ class Attention(nn.Module):
398
+
399
+ def __init__(self, embed_dim=512, block_size=16, n_head=8, drop_out_rate=0.1):
400
+ super().__init__()
401
+ assert embed_dim % 8 == 0
402
+ # key, query, value projections for all heads
403
+ self.key = nn.Linear(embed_dim, embed_dim)
404
+ self.query = nn.Linear(embed_dim, embed_dim)
405
+ self.value = nn.Linear(embed_dim, embed_dim)
406
+
407
+ self.attn_drop = nn.Dropout(drop_out_rate)
408
+ self.resid_drop = nn.Dropout(drop_out_rate)
409
+
410
+ self.proj = nn.Linear(embed_dim, embed_dim)
411
+ self.n_head = n_head
412
+
413
+ def forward(self, x, src_mask):
414
+ B, T, C = x.size()
415
+
416
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
417
+ k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
418
+ q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
419
+ v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
420
+ # self-attention with an optional padding mask (not causal): (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
421
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
422
+ if src_mask is not None:
423
+ att[~src_mask] = float('-inf')
424
+ att = F.softmax(att, dim=-1)
425
+ att = self.attn_drop(att)
426
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
427
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
428
+
429
+ # output projection
430
+ y = self.resid_drop(self.proj(y))
431
+ return y
432
+
433
+ class Block(nn.Module):
434
+
435
+ def __init__(self, embed_dim=512, block_size=16, n_head=8, drop_out_rate=0.1, fc_rate=4):
436
+ super().__init__()
437
+ self.ln1 = nn.LayerNorm(embed_dim)
438
+ self.ln2 = nn.LayerNorm(embed_dim)
439
+ self.attn = Attention(embed_dim, block_size, n_head, drop_out_rate)
440
+ self.mlp = nn.Sequential(
441
+ nn.Linear(embed_dim, fc_rate * embed_dim),
442
+ nn.GELU(),
443
+ nn.Linear(fc_rate * embed_dim, embed_dim),
444
+ nn.Dropout(drop_out_rate),
445
+ )
446
+
447
+ def forward(self, x, src_mask=None):
448
+ x = x + self.attn(self.ln1(x), src_mask)
449
+ x = x + self.mlp(self.ln2(x))
450
+ return x
451
+
452
+ class CrossAttention(nn.Module):
453
+
454
+ def __init__(self, embed_dim=512, block_size=16, n_head=8, drop_out_rate=0.1):
455
+ super().__init__()
456
+ assert embed_dim % 8 == 0
457
+ # key, query, value projections for all heads
458
+ self.key = nn.Linear(embed_dim, embed_dim)
459
+ self.query = nn.Linear(embed_dim, embed_dim)
460
+ self.value = nn.Linear(embed_dim, embed_dim)
461
+
462
+ self.attn_drop = nn.Dropout(drop_out_rate)
463
+ self.resid_drop = nn.Dropout(drop_out_rate)
464
+
465
+ self.proj = nn.Linear(embed_dim, embed_dim)
466
+ # lower-triangular mask buffer (kept from the self-attention template; not applied in the cross-attention forward below)
467
+ self.register_buffer("mask", torch.tril(torch.ones(block_size, 77)).view(1, 1, block_size, 77))
468
+ self.n_head = n_head
469
+
470
+ def forward(self, x,word_emb):
471
+ B, T, C = x.size()
472
+ B, N, D = word_emb.size()
473
+
474
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
475
+ k = self.key(word_emb).view(B, N, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
476
+ q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
477
+ v = self.value(word_emb).view(B, N, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
478
+ # cross-attention between motion queries and word-embedding keys/values: (B, nh, T, hs) x (B, nh, hs, N) -> (B, nh, T, N)
479
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
480
+ att = F.softmax(att, dim=-1)
481
+ att = self.attn_drop(att)
482
+ y = att @ v # (B, nh, T, N) x (B, nh, N, hs) -> (B, nh, T, hs)
483
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
484
+
485
+ # output projection
486
+ y = self.resid_drop(self.proj(y))
487
+ return y
488
+
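+ # [INFO] In CrossAttention, queries come from the motion token sequence (length T) while
+ # keys/values come from the CLIP word embeddings (length N, 77 for CLIP), so each motion
+ # token attends over the text tokens.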
489
+ class Block_crossatt(nn.Module):
490
+
491
+ def __init__(self, embed_dim=512, block_size=16, n_head=8, drop_out_rate=0.1, fc_rate=4):
492
+ super().__init__()
493
+ self.ln1 = nn.LayerNorm(embed_dim)
494
+ self.ln2 = nn.LayerNorm(embed_dim)
495
+ self.ln3 = nn.LayerNorm(embed_dim)
496
+ self.attn = CrossAttention(embed_dim, block_size, n_head, drop_out_rate)
497
+ self.mlp = nn.Sequential(
498
+ nn.Linear(embed_dim, fc_rate * embed_dim),
499
+ nn.GELU(),
500
+ nn.Linear(fc_rate * embed_dim, embed_dim),
501
+ nn.Dropout(drop_out_rate),
502
+ )
503
+
504
+ def forward(self, x,word_emb):
505
+ x = x + self.attn(self.ln1(x), self.ln3(word_emb))
506
+ x = x + self.mlp(self.ln2(x))
507
+ return x
508
+
509
+ class CrossCondTransBase(nn.Module):
510
+
511
+ def __init__(self,
512
+ vqvae,
513
+ num_vq=1024,
514
+ embed_dim=512,
515
+ clip_dim=512,
516
+ block_size=16,
517
+ num_layers=2,
518
+ num_local_layer = 1,
519
+ n_head=8,
520
+ drop_out_rate=0.1,
521
+ fc_rate=4):
522
+ super().__init__()
523
+ self.vqvae = vqvae
524
+ # self.tok_emb = nn.Embedding(num_vq + 3, embed_dim).requires_grad_(False)
525
+ self.learn_tok_emb = nn.Embedding(3, self.vqvae.vqvae.code_dim)# [INFO] 3 = [end_id, blank_id, mask_id]
526
+ self.to_emb = nn.Linear(self.vqvae.vqvae.code_dim, embed_dim)
527
+
528
+ self.cond_emb = nn.Linear(clip_dim, embed_dim)
529
+ self.pos_embedding = nn.Embedding(block_size, embed_dim)
530
+ self.drop = nn.Dropout(drop_out_rate)
531
+ # transformer block
532
+ self.blocks = nn.Sequential(*[Block(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_layers-num_local_layer)])
533
+ self.pos_embed = pos_encoding.PositionEmbedding(block_size, embed_dim, 0.0, False)
534
+
535
+ self.num_local_layer = num_local_layer
536
+ if num_local_layer > 0:
537
+ self.word_emb = nn.Linear(clip_dim, embed_dim)
538
+ self.cross_att = nn.Sequential(*[Block_crossatt(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_local_layer)])
539
+ self.block_size = block_size
540
+
541
+ self.apply(self._init_weights)
542
+
543
+ def get_block_size(self):
544
+ return self.block_size
545
+
546
+ def _init_weights(self, module):
547
+ if isinstance(module, (nn.Linear, nn.Embedding)):
548
+ module.weight.data.normal_(mean=0.0, std=0.02)
549
+ if isinstance(module, nn.Linear) and module.bias is not None:
550
+ module.bias.data.zero_()
551
+ elif isinstance(module, nn.LayerNorm):
552
+ module.bias.data.zero_()
553
+ module.weight.data.fill_(1.0)
554
+
555
+ def forward(self, idx, clip_feature, src_mask, word_emb):
556
+ if len(idx) == 0:
557
+ token_embeddings = self.cond_emb(clip_feature).unsqueeze(1)
558
+ else:
559
+ b, t = idx.size()
560
+ assert t <= self.block_size, "Cannot forward, model block size is exhausted."
561
+ # forward the Trans model
562
+ not_learn_idx = idx<self.vqvae.vqvae.num_code
563
+ learn_idx = ~not_learn_idx
564
+
565
+ token_embeddings = torch.empty((*idx.shape, self.vqvae.vqvae.code_dim), device=idx.device)
566
+ token_embeddings[not_learn_idx] = self.vqvae.vqvae.quantizer.dequantize(idx[not_learn_idx]).requires_grad_(False)
567
+ token_embeddings[learn_idx] = self.learn_tok_emb(idx[learn_idx]-self.vqvae.vqvae.num_code)
568
+ token_embeddings = self.to_emb(token_embeddings)
569
+
570
+ if self.num_local_layer > 0:
571
+ word_emb = self.word_emb(word_emb)
572
+ token_embeddings = self.pos_embed(token_embeddings)
573
+ for module in self.cross_att:
574
+ token_embeddings = module(token_embeddings, word_emb)
575
+ token_embeddings = torch.cat([self.cond_emb(clip_feature).unsqueeze(1), token_embeddings], dim=1)
576
+
577
+ x = self.pos_embed(token_embeddings)
578
+ for block in self.blocks:
579
+ x = block(x, src_mask)
580
+
581
+ return x
582
+
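+ # [INFO] In CrossCondTransBase.forward above, indices below num_code are embedded by
+ # dequantizing them with the frozen VQ-VAE codebook, while the special tokens
+ # (end / pad / mask) use the small learnable embedding learn_tok_emb; if num_local_layer > 0,
+ # the token embeddings first cross-attend to the CLIP word embeddings, and the global blocks
+ # are conditioned by prepending the pooled CLIP feature as the first token.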
583
+
584
+ class CrossCondTransHead(nn.Module):
585
+
586
+ def __init__(self,
587
+ num_vq=1024,
588
+ embed_dim=512,
589
+ block_size=16,
590
+ num_layers=2,
591
+ n_head=8,
592
+ drop_out_rate=0.1,
593
+ fc_rate=4):
594
+ super().__init__()
595
+
596
+ self.blocks = nn.Sequential(*[Block(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_layers)])
597
+ self.ln_f = nn.LayerNorm(embed_dim)
598
+ self.head = nn.Linear(embed_dim, num_vq, bias=False)
599
+ self.block_size = block_size
600
+
601
+ self.apply(self._init_weights)
602
+
603
+ def get_block_size(self):
604
+ return self.block_size
605
+
606
+ def _init_weights(self, module):
607
+ if isinstance(module, (nn.Linear, nn.Embedding)):
608
+ module.weight.data.normal_(mean=0.0, std=0.02)
609
+ if isinstance(module, nn.Linear) and module.bias is not None:
610
+ module.bias.data.zero_()
611
+ elif isinstance(module, nn.LayerNorm):
612
+ module.bias.data.zero_()
613
+ module.weight.data.fill_(1.0)
614
+
615
+ def forward(self, x, src_mask):
616
+ for block in self.blocks:
617
+ x = block(x, src_mask)
618
+ x = self.ln_f(x)
619
+ logits = self.head(x)
620
+ return logits
621
+
622
+
623
+
624
+
625
+
626
+
models/t2m_trans_uplow.py ADDED
@@ -0,0 +1,637 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import functional as F
5
+ from torch.distributions import Categorical
6
+ import models.pos_encoding as pos_encoding
7
+ from exit.utils import cosine_schedule, uniform, top_k, gumbel_sample, top_p
8
+ from tqdm import tqdm
9
+ from einops import rearrange, repeat
10
+ from exit.utils import get_model, generate_src_mask
11
+
12
+
13
+ class PatchUpSampling(nn.Module):
14
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
15
+ super().__init__()
16
+ self.dim = dim
17
+ self.up_sampling = nn.Linear(dim, 4 * dim, bias=False)
18
+ self.norm = norm_layer(dim)
19
+
20
+ def forward(self, x):
21
+ """
22
+ x: B, F, C
23
+ """
24
+ x = self.norm(x)
25
+ x = self.up_sampling(x)
26
+ x0 = x[:, :, 0::4]
27
+ x1 = x[:, :, 1::4]
28
+ x2 = x[:, :, 2::4]
29
+ x3 = x[:, :, 3::4]
30
+ x = torch.cat([x0, x1, x2, x3], 1)
31
+ return x
32
+
33
+ class Decoder_Transformer(nn.Module):
34
+ def __init__(self,
35
+ code_dim=1024,
36
+ embed_dim=512,
37
+ output_dim=263,
38
+ block_size=16,
39
+ num_layers=2,
40
+ n_head=8,
41
+ drop_out_rate=0.1,
42
+ fc_rate=4):
43
+
44
+ super().__init__()
45
+ self.joint_embed = nn.Linear(code_dim, embed_dim)
46
+ self.drop = nn.Dropout(drop_out_rate)
47
+ # transformer block
48
+ self.blocks = nn.Sequential(*[Block(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_layers)])
49
+ self.up_sample = PatchUpSampling(embed_dim)
50
+ self.pos_embed = pos_encoding.PositionEmbedding(block_size, embed_dim, 0.0, False)
51
+ self.head = nn.Sequential(nn.LayerNorm(embed_dim),
52
+ nn.Linear(embed_dim, output_dim))
53
+ self.block_size = block_size
54
+ self.n_head = n_head
55
+ self.apply(self._init_weights)
56
+
57
+ def get_block_size(self):
58
+ return self.block_size
59
+
60
+ def _init_weights(self, module):
61
+ if isinstance(module, (nn.Linear, nn.Embedding)):
62
+ module.weight.data.normal_(mean=0.0, std=0.02)
63
+ if isinstance(module, nn.Linear) and module.bias is not None:
64
+ module.bias.data.zero_()
65
+ elif isinstance(module, nn.LayerNorm):
66
+ module.bias.data.zero_()
67
+ module.weight.data.fill_(1.0)
68
+
69
+ def forward(self, token_embeddings):
70
+ # token_embeddings = self.tok_emb(idx)
71
+ # B, T = src_mask.shape
72
+ # src_mask = src_mask.view(B, 1, 1, T).repeat(1, self.n_head, T, 1)
73
+
74
+ token_embeddings = token_embeddings.permute(0, 2, 1)
75
+ token_embeddings = self.joint_embed(token_embeddings)
76
+ x = self.pos_embed(token_embeddings)
77
+
78
+ # block_step_len = int(len(self.blocks)/3)
79
+ # mask_temp = get_attn_mask(_range=3, _max=x.shape[1]).to(src_mask.device)
80
+ # eye = torch.eye(x.shape[1]).unsqueeze(0).unsqueeze(0).to(src_mask.device).bool()
81
+ # src_mask = src_mask*mask_temp + eye
82
+
83
+ for block in self.blocks:
84
+ x = block(x)
85
+ x = self.up_sample(x)
86
+
87
+ # mask_2 = mask_1.repeat(1, 1, 2, 2)
88
+ # for block in self.blocks[block_step_len:2*block_step_len]:
89
+ # x = block(x, mask_2)
90
+ # x = self.up_sample(x)
91
+
92
+ # mask_3 = mask_2.repeat(1, 1, 2, 2)
93
+ # for block in self.blocks[2*block_step_len:]:
94
+ # x = block(x, mask_3)
95
+
96
+ x = self.head(x).permute(0, 2, 1)
97
+ return x
98
+
99
+ # https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py#L342C9-L343C33
100
+ class PatchMerging(nn.Module):
101
+ def __init__(self, input_feats, dim, norm_layer=nn.LayerNorm):
102
+ super().__init__()
103
+ self.dim = dim
104
+ self.reduction = nn.Linear(4 * input_feats, dim, bias=False)
105
+ self.norm = norm_layer(4 * input_feats)
106
+
107
+ def forward(self, x):
108
+ """
109
+ x: B, F, C
110
+ """
111
+ x0 = x[:, 0::4, :] # B F/2 C
112
+ x1 = x[:, 1::4, :]
113
+ x2 = x[:, 2::4, :] # B F/2 C
114
+ x3 = x[:, 3::4, :]
115
+ x = torch.cat([x0, x1, x2, x3], -1) # B F/2 2*C
116
+ x = self.norm(x)
117
+ x = self.reduction(x)
118
+ return x
119
+
120
+ class Encoder_Transformer(nn.Module):
121
+ def __init__(self,
122
+ input_feats=1024,
123
+ embed_dim=512,
124
+ output_dim=263,
125
+ block_size=16,
126
+ num_layers=2,
127
+ n_head=8,
128
+ drop_out_rate=0.1,
129
+ fc_rate=4):
130
+
131
+ super().__init__()
132
+ self.joint_embed = nn.Linear(input_feats, embed_dim)
133
+ self.drop = nn.Dropout(drop_out_rate)
134
+ # transformer block
135
+ self.blocks = nn.Sequential(*[Block(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_layers)])
136
+ # self.patch_merging1 = PatchMerging(input_feats, embed_dim)
137
+ # self.patch_merging2 = PatchMerging(embed_dim)
138
+ self.weighted_mean_norm = nn.LayerNorm(embed_dim)
139
+ self.weighted_mean = torch.nn.Conv1d(in_channels=block_size, out_channels=1, kernel_size=1)
140
+
141
+ self.pos_embed = pos_encoding.PositionEmbedding(block_size, embed_dim, 0.0, False)
142
+ self.head = nn.Sequential(nn.LayerNorm(embed_dim),
143
+ nn.Linear(embed_dim, output_dim))
144
+ self.block_size = block_size
145
+ self.n_head = n_head
146
+ self.apply(self._init_weights)
147
+
148
+ def get_block_size(self):
149
+ return self.block_size
150
+
151
+ def _init_weights(self, module):
152
+ if isinstance(module, (nn.Linear, nn.Embedding)):
153
+ module.weight.data.normal_(mean=0.0, std=0.02)
154
+ if isinstance(module, nn.Linear) and module.bias is not None:
155
+ module.bias.data.zero_()
156
+ elif isinstance(module, nn.LayerNorm):
157
+ module.bias.data.zero_()
158
+ module.weight.data.fill_(1.0)
159
+
160
+ def forward(self, joints):
161
+ # B, T = src_mask.shape
162
+ # src_mask = src_mask.view(B, 1, 1, T).repeat(1, self.n_head, T, 1)
163
+
164
+ joints = joints.permute(0,2,1)
165
+ # token_embeddings = self.joint_embed(joints)
166
+
167
+ block_step_len = int(len(self.blocks)/3)
168
+
169
+ x = self.joint_embed(joints)
170
+ token_len = int(x.shape[1]/self.block_size)
171
+ _original_shape = list(x.shape)
172
+ x = x.view(x.shape[0]*token_len, self.block_size, -1)
173
+
174
+ # mask_temp = get_attn_mask(_range=3, _max=x.shape[1]).to(src_mask.device)
175
+ # eye = torch.eye(x.shape[1]).unsqueeze(0).unsqueeze(0).to(src_mask.device).bool()
176
+ # src_mask = src_mask*mask_temp + eye
177
+
178
+ x = self.pos_embed(x)
179
+ for block in self.blocks:
180
+ x = block(x)
181
+ x = self.weighted_mean_norm(x)
182
+ x = self.weighted_mean(x)
183
+ _original_shape[1] = int(_original_shape[1] / self.block_size)
184
+ x = x.view(*_original_shape)
185
+
186
+ # for block in self.blocks[block_step_len:2*block_step_len]:
187
+ # x = block(x)
188
+ # x = self.patch_merging2(x)
189
+
190
+ # for block in self.blocks[2*block_step_len:]:
191
+ # x = block(x)
192
+ x = self.head(x).permute(0, 2, 1)
193
+ return x
194
+
195
+ class Text2Motion_Transformer(nn.Module):
196
+
197
+ def __init__(self,
198
+ vqvae,
199
+ num_vq=1024,
200
+ embed_dim=512,
201
+ clip_dim=512,
202
+ block_size=16,
203
+ num_layers=2,
204
+ num_local_layer=0,
205
+ n_head=8,
206
+ drop_out_rate=0.1,
207
+ fc_rate=4):
208
+ super().__init__()
209
+ self.n_head = n_head
210
+ self.trans_base = CrossCondTransBase(vqvae, num_vq, embed_dim, clip_dim, block_size, num_layers, num_local_layer, n_head, drop_out_rate, fc_rate)
211
+ self.trans_head = CrossCondTransHead(num_vq, embed_dim, block_size, num_layers, n_head, drop_out_rate, fc_rate)
212
+ self.block_size = block_size
213
+ self.num_vq = num_vq
214
+
215
+
216
+ def get_block_size(self):
217
+ return self.block_size
218
+
219
+ def forward(self, *args, type='forward', **kwargs):
220
+ '''type=[forward, sample]'''
221
+ if type=='forward':
222
+ return self.forward_function(*args, **kwargs)
223
+ elif type=='sample':
224
+ return self.sample(*args, **kwargs)
225
+ elif type=='inpaint':
226
+ return self.inpaint(*args, **kwargs)
227
+ else:
228
+ raise ValueError(f'Unknown "{type}" type')
229
+
230
+ def get_attn_mask(self, src_mask, att_txt=None, txt_mark=None):
231
+ if att_txt is None:
232
+ att_txt = torch.tensor([[True]]*src_mask.shape[0]).to(src_mask.device)
233
+ src_mask = torch.cat([att_txt, src_mask], dim=1)
234
+ B, T = src_mask.shape
235
+ src_mask = src_mask.view(B, 1, 1, T).repeat(1, self.n_head, T, 1)
236
+ if txt_mark is not None:
237
+ att_txt_txt = torch.tensor([[True]]*txt_mark.shape[0]).to(txt_mark.device)
238
+ txt_mark = torch.cat([att_txt_txt, txt_mark], dim=1)
239
+ src_mask[:, :, :, 0] = txt_mark.view(B, 1, T).repeat(1, self.n_head, 1)
240
+ return src_mask
241
+
242
+ def forward_function(self, idx_upper, idx_lower, clip_feature, src_mask=None, att_txt=None, txt_mark=None, word_emb=None):
243
+ # MLD:
244
+ # if att_txt is None:
245
+ # att_txt = torch.tensor([[True]]*src_mask.shape[0]).to(src_mask.device)
246
+ # src_mask = torch.cat([att_txt, src_mask], dim=1)
247
+ # logits = self.skip_trans(idxs, clip_feature, src_mask)
248
+
249
+ # T2M-BD
250
+ if src_mask is not None:
251
+ src_mask = self.get_attn_mask(src_mask, att_txt, txt_mark)
252
+ feat = self.trans_base(idx_upper, idx_lower, clip_feature, src_mask, word_emb)
253
+ logits = self.trans_head(feat, src_mask)
254
+
255
+ return logits
256
+
257
+ def sample(self, clip_feature, idx_lower, word_emb, m_length=None, if_test=False, rand_pos=False, CFG=-1):
258
+ max_steps = 20
259
+ max_length = 49
260
+ batch_size = clip_feature.shape[0]
261
+ mask_id = self.num_vq + 2
262
+ pad_id = self.num_vq + 1
263
+ end_id = self.num_vq
264
+ shape = (batch_size, self.block_size - 1)
265
+ topk_filter_thres = .9
266
+ starting_temperature = 1.0
267
+ scores = torch.ones(shape, dtype = torch.float32, device = clip_feature.device)
268
+
269
+ m_tokens_len = torch.ceil((m_length)/4)
270
+ src_token_mask = generate_src_mask(self.block_size-1, m_tokens_len+1)
271
+ src_token_mask_noend = generate_src_mask(self.block_size-1, m_tokens_len)
272
+ ids = torch.full(shape, mask_id, dtype = torch.long, device = clip_feature.device)
273
+
274
+ # [TODO] confirm that these 2 lines are not necessary (repeated below; they may not be needed at all)
275
+ ids[~src_token_mask] = pad_id # [INFO] replace with pad id
276
+ ids.scatter_(-1, m_tokens_len[..., None].long(), end_id) # [INFO] replace with end id
277
+
278
+ ### PlayGround ####
279
+ # score high = mask
280
+ # m_tokens_len = torch.ceil((m_length)/4)
281
+ # src_token_mask = generate_src_mask(self.block_size-1, m_tokens_len+1)
282
+
283
+ # # mock
284
+ # timestep = torch.tensor(.5)
285
+ # rand_mask_prob = cosine_schedule(timestep)
286
+ # scores = torch.arange(self.block_size - 1).repeat(batch_size, 1).cuda()
287
+ # scores[1] = torch.flip(torch.arange(self.block_size - 1), dims=(0,))
288
+
289
+ # # iteration
290
+ # num_token_masked = (rand_mask_prob * m_tokens_len).int().clip(min=1)
291
+ # scores[~src_token_mask] = -1e5
292
+ # masked_indices = scores.argsort(dim=-1, descending=True) # This is flipped the order. The highest score is the first in order.
293
+ # masked_indices = masked_indices < num_token_masked.unsqueeze(-1) # So it can filter out by "< num_token_masked". We want to filter the high score as a mask
294
+ # ids[masked_indices] = mask_id
295
+ #########################
296
+ temp = []
297
+ sample_max_steps = torch.round(max_steps/max_length*m_tokens_len) + 1e-8
298
+ for step in range(max_steps):
299
+ timestep = torch.clip(step/(sample_max_steps), max=1)
300
+ rand_mask_prob = cosine_schedule(timestep) # timestep #
301
+ num_token_masked = (rand_mask_prob * m_tokens_len).long().clip(min=1)
302
+ # [INFO] rm no motion frames
303
+ scores[~src_token_mask_noend] = 0
304
+ scores = scores/scores.sum(-1)[:, None] # normalize only unmasked token
305
+
306
+ # if rand_pos:
307
+ # sorted_score_indices = scores.multinomial(scores.shape[-1], replacement=False) # stochastic
308
+ # else:
309
+ sorted, sorted_score_indices = scores.sort(descending=True) # deterministic
310
+
311
+ ids[~src_token_mask] = pad_id # [INFO] replace with pad id
312
+ ids.scatter_(-1, m_tokens_len[..., None].long(), end_id) # [INFO] replace with end id
313
+ ## [INFO] Set the "num_token_masked" positions with the highest "scores" to "mask_id"
314
+ select_masked_indices = generate_src_mask(sorted_score_indices.shape[1], num_token_masked)
315
+ # [INFO] repeat last_index so the unused scatter_ slots simply re-mask the already-selected last position.
316
+ last_index = sorted_score_indices.gather(-1, num_token_masked.unsqueeze(-1)-1)
317
+ sorted_score_indices = sorted_score_indices * select_masked_indices + (last_index*~select_masked_indices)
318
+ ids.scatter_(-1, sorted_score_indices, mask_id)
319
+ # if torch.isclose(timestep, torch.tensor(0.7647), atol=.01):
320
+ # print('masked_indices:', ids[0], src_token_mask[0])
321
+
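+ # [INFO] Classifier-free guidance: when CFG != -1, the batch is duplicated with and without
+ # text attention (att_txt True/False) and the logits are extrapolated as
+ # (1 + CFG) * text-conditioned - CFG * unconditioned.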
322
+ if CFG!=-1:
323
+ # print('ids:', ids.shape, clip_feature.shape, src_token_mask.shape)
324
+ _ids = ids.repeat(2,1)
325
+ _clip_feature = clip_feature.repeat(2,1)
326
+ _src_token_mask = src_token_mask.repeat(2,1)
327
+ att_txt = torch.cat( (torch.ones((batch_size,1), dtype=torch.bool),
328
+ torch.zeros((batch_size,1), dtype=torch.bool) )).to(_ids.device)
329
+ logits = self.forward(_ids, idx_lower, _clip_feature, _src_token_mask, att_txt)[:,1:]
330
+ logits_textcond = logits[:batch_size]
331
+ logits_uncond = logits[batch_size:]
332
+ # logits = (1-CFG)*logits_textcond + CFG*logits_uncond
333
+ logits = (1+CFG)*logits_textcond - CFG*logits_uncond
334
+ else:
335
+ logits = self.forward(ids, idx_lower, clip_feature, src_token_mask, word_emb=word_emb)[:,1:]
336
+ filtered_logits = logits #top_p(logits, .5) # #top_k(logits, topk_filter_thres)
337
+ if rand_pos:
338
+ temperature = 1 #starting_temperature * (steps_until_x0 / timesteps) # temperature is annealed
339
+ else:
340
+ temperature = 0 #starting_temperature * (steps_until_x0 / timesteps) # temperature is annealed
341
+
342
+ # [INFO] if temperature==0: is equal to argmax (filtered_logits.argmax(dim = -1))
343
+ # pred_ids = filtered_logits.argmax(dim = -1)
344
+ pred_ids = gumbel_sample(filtered_logits, temperature = temperature, dim = -1)
345
+ is_mask = ids == mask_id
346
+ temp.append(is_mask[:1])
347
+
348
+ # mid = is_mask[0][:m_tokens_len[0].int()]
349
+ # mid = mid.nonzero(as_tuple=True)[0]
350
+ # print(is_mask[0].sum(), m_tokens_len[0])
351
+
352
+ ids = torch.where(
353
+ is_mask,
354
+ pred_ids,
355
+ ids
356
+ )
357
+
358
+ # if timestep == 1.:
359
+ # print(probs_without_temperature.shape)
360
+ probs_without_temperature = logits.softmax(dim = -1)
361
+ scores = 1 - probs_without_temperature.gather(-1, pred_ids[..., None])
362
+ scores = rearrange(scores, '... 1 -> ...')
363
+ scores = scores.masked_fill(~is_mask, 0)
364
+ if if_test:
365
+ return ids, temp
366
+ return ids
367
+
368
+ def inpaint(self, first_tokens, last_tokens, clip_feature=None, inpaint_len=2, rand_pos=False):
369
+ # support only one sample
370
+ assert first_tokens.shape[0] == 1
371
+ assert last_tokens.shape[0] == 1
372
+ max_steps = 20
373
+ max_length = 49
374
+ batch_size = first_tokens.shape[0]
375
+ mask_id = self.num_vq + 2
376
+ pad_id = self.num_vq + 1
377
+ end_id = self.num_vq
378
+ shape = (batch_size, self.block_size - 1)
379
+ scores = torch.ones(shape, dtype = torch.float32, device = first_tokens.device)
380
+
381
+ # force add first / last tokens
382
+ first_partition_pos_idx = first_tokens.shape[1]
383
+ second_partition_pos_idx = first_partition_pos_idx + inpaint_len
384
+ end_pos_idx = second_partition_pos_idx + last_tokens.shape[1]
385
+
386
+ m_tokens_len = torch.ones(batch_size, device = first_tokens.device)*end_pos_idx
387
+
388
+ src_token_mask = generate_src_mask(self.block_size-1, m_tokens_len+1)
389
+ src_token_mask_noend = generate_src_mask(self.block_size-1, m_tokens_len)
390
+ ids = torch.full(shape, mask_id, dtype = torch.long, device = first_tokens.device)
391
+
392
+ ids[:, :first_partition_pos_idx] = first_tokens
393
+ ids[:, second_partition_pos_idx:end_pos_idx] = last_tokens
394
+ src_token_mask_noend[:, :first_partition_pos_idx] = False
395
+ src_token_mask_noend[:, second_partition_pos_idx:end_pos_idx] = False
396
+
397
+ # [TODO] confirm that these 2 lines are not necessary (repeated below; they may not be needed at all)
398
+ ids[~src_token_mask] = pad_id # [INFO] replace with pad id
399
+ ids.scatter_(-1, m_tokens_len[..., None].long(), end_id) # [INFO] replace with end id
400
+
401
+ temp = []
402
+ sample_max_steps = torch.round(max_steps/max_length*m_tokens_len) + 1e-8
403
+
404
+ if clip_feature is None:
405
+ clip_feature = torch.zeros(1, 512).to(first_tokens.device)
406
+ att_txt = torch.zeros((batch_size,1), dtype=torch.bool, device = first_tokens.device)
407
+ else:
408
+ att_txt = torch.ones((batch_size,1), dtype=torch.bool, device = first_tokens.device)
409
+
410
+ for step in range(max_steps):
411
+ timestep = torch.clip(step/(sample_max_steps), max=1)
412
+ rand_mask_prob = cosine_schedule(timestep) # timestep #
413
+ num_token_masked = (rand_mask_prob * m_tokens_len).long().clip(min=1)
414
+ # [INFO] rm no motion frames
415
+ scores[~src_token_mask_noend] = 0
416
+ # [INFO] rm begin and end frames
417
+ scores[:, :first_partition_pos_idx] = 0
418
+ scores[:, second_partition_pos_idx:end_pos_idx] = 0
419
+ scores = scores/scores.sum(-1)[:, None] # normalize only unmasked token
420
+
421
+ sorted, sorted_score_indices = scores.sort(descending=True) # deterministic
422
+
423
+ ids[~src_token_mask] = pad_id # [INFO] replace with pad id
424
+ ids.scatter_(-1, m_tokens_len[..., None].long(), end_id) # [INFO] replace with end id
425
+ ## [INFO] Set the "num_token_masked" positions with the highest "scores" to "mask_id"
426
+ select_masked_indices = generate_src_mask(sorted_score_indices.shape[1], num_token_masked)
427
+ # [INFO] repeat last_index so the unused scatter_ slots simply re-mask the already-selected last position.
428
+ last_index = sorted_score_indices.gather(-1, num_token_masked.unsqueeze(-1)-1)
429
+ sorted_score_indices = sorted_score_indices * select_masked_indices + (last_index*~select_masked_indices)
430
+ ids.scatter_(-1, sorted_score_indices, mask_id)
431
+
432
+ # [TODO] force-replace begin/end tokens because the number of masked tokens can exceed the actual inpainting frames
433
+ ids[:, :first_partition_pos_idx] = first_tokens
434
+ ids[:, second_partition_pos_idx:end_pos_idx] = last_tokens
435
+
436
+ logits = self.forward(ids, clip_feature, src_token_mask, att_txt)[:,1:]
437
+ filtered_logits = logits #top_k(logits, topk_filter_thres)
438
+ if rand_pos:
439
+ temperature = 1 #starting_temperature * (steps_until_x0 / timesteps) # temperature is annealed
440
+ else:
441
+ temperature = 0 #starting_temperature * (steps_until_x0 / timesteps) # temperature is annealed
442
+
443
+ # [INFO] if temperature==0: is equal to argmax (filtered_logits.argmax(dim = -1))
444
+ # pred_ids = filtered_logits.argmax(dim = -1)
445
+ pred_ids = gumbel_sample(filtered_logits, temperature = temperature, dim = -1)
446
+ is_mask = ids == mask_id
447
+ temp.append(is_mask[:1])
448
+
449
+ ids = torch.where(
450
+ is_mask,
451
+ pred_ids,
452
+ ids
453
+ )
454
+
455
+ probs_without_temperature = logits.softmax(dim = -1)
456
+ scores = 1 - probs_without_temperature.gather(-1, pred_ids[..., None])
457
+ scores = rearrange(scores, '... 1 -> ...')
458
+ scores = scores.masked_fill(~is_mask, 0)
459
+ return ids
460
+
461
+ class Attention(nn.Module):
462
+
463
+ def __init__(self, embed_dim=512, block_size=16, n_head=8, drop_out_rate=0.1):
464
+ super().__init__()
465
+ assert embed_dim % 8 == 0
466
+ # key, query, value projections for all heads
467
+ self.key = nn.Linear(embed_dim, embed_dim)
468
+ self.query = nn.Linear(embed_dim, embed_dim)
469
+ self.value = nn.Linear(embed_dim, embed_dim)
470
+
471
+ self.attn_drop = nn.Dropout(drop_out_rate)
472
+ self.resid_drop = nn.Dropout(drop_out_rate)
473
+
474
+ self.proj = nn.Linear(embed_dim, embed_dim)
475
+ self.n_head = n_head
476
+
477
+ def forward(self, x, src_mask):
478
+ B, T, C = x.size()
479
+
480
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
481
+ k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
482
+ q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
483
+ v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
484
+ # self-attention with an optional padding mask (not causal): (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
485
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
486
+ if src_mask is not None:
487
+ att[~src_mask] = float('-inf')
488
+ att = F.softmax(att, dim=-1)
489
+ att = self.attn_drop(att)
490
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
491
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
492
+
493
+ # output projection
494
+ y = self.resid_drop(self.proj(y))
495
+ return y
496
+
497
+ class Block(nn.Module):
498
+
499
+ def __init__(self, embed_dim=512, block_size=16, n_head=8, drop_out_rate=0.1, fc_rate=4):
500
+ super().__init__()
501
+ self.ln1 = nn.LayerNorm(embed_dim)
502
+ self.ln2 = nn.LayerNorm(embed_dim)
503
+ self.attn = Attention(embed_dim, block_size, n_head, drop_out_rate)
504
+ self.mlp = nn.Sequential(
505
+ nn.Linear(embed_dim, fc_rate * embed_dim),
506
+ nn.GELU(),
507
+ nn.Linear(fc_rate * embed_dim, embed_dim),
508
+ nn.Dropout(drop_out_rate),
509
+ )
510
+
511
+ def forward(self, x, src_mask=None):
512
+ x = x + self.attn(self.ln1(x), src_mask)
513
+ x = x + self.mlp(self.ln2(x))
514
+ return x
515
+
516
+ from models.t2m_trans import Block_crossatt
517
+ class CrossCondTransBase(nn.Module):
518
+
519
+ def __init__(self,
520
+ vqvae,
521
+ num_vq=1024,
522
+ embed_dim=512,
523
+ clip_dim=512,
524
+ block_size=16,
525
+ num_layers=2,
526
+ num_local_layer = 1,
527
+ n_head=8,
528
+ drop_out_rate=0.1,
529
+ fc_rate=4):
530
+ super().__init__()
531
+ self.vqvae = vqvae
532
+ # self.tok_emb = nn.Embedding(num_vq + 3, embed_dim).requires_grad_(False)
533
+ self.learn_tok_emb = nn.Embedding(3, int(self.vqvae.vqvae.code_dim/2))# [INFO] 3 = [end_id, blank_id, mask_id]
534
+ self.to_emb = nn.Linear(self.vqvae.vqvae.code_dim, embed_dim)
535
+
536
+ self.cond_emb = nn.Linear(clip_dim, embed_dim)
537
+ self.pos_embedding = nn.Embedding(block_size, embed_dim)
538
+ self.drop = nn.Dropout(drop_out_rate)
539
+ # transformer block
540
+ self.blocks = nn.Sequential(*[Block(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_layers-num_local_layer)])
541
+ self.pos_embed = pos_encoding.PositionEmbedding(block_size, embed_dim, 0.0, False)
542
+
543
+ self.num_local_layer = num_local_layer
544
+ if num_local_layer > 0:
545
+ self.word_emb = nn.Linear(clip_dim, embed_dim)
546
+ self.cross_att = nn.Sequential(*[Block_crossatt(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_local_layer)])
547
+ self.block_size = block_size
548
+
549
+ self.apply(self._init_weights)
550
+
551
+ def get_block_size(self):
552
+ return self.block_size
553
+
554
+ def _init_weights(self, module):
555
+ if isinstance(module, (nn.Linear, nn.Embedding)):
556
+ module.weight.data.normal_(mean=0.0, std=0.02)
557
+ if isinstance(module, nn.Linear) and module.bias is not None:
558
+ module.bias.data.zero_()
559
+ elif isinstance(module, nn.LayerNorm):
560
+ module.bias.data.zero_()
561
+ module.weight.data.fill_(1.0)
562
+
563
+ def forward(self, idx_upper, idx_lower, clip_feature, src_mask, word_emb):
564
+ if len(idx_upper) == 0:
565
+ token_embeddings = self.cond_emb(clip_feature).unsqueeze(1)
566
+ else:
567
+ b, t = idx_upper.size()
568
+ assert t <= self.block_size, "Cannot forward, model block size is exhausted."
569
+ # forward the Trans model
570
+ learn_idx_upper = idx_upper>=self.vqvae.vqvae.num_code
571
+ learn_idx_lower = idx_lower>=self.vqvae.vqvae.num_code
572
+
573
+ code_dim = self.vqvae.vqvae.code_dim
574
+ token_embeddings = torch.empty((*idx_upper.shape, code_dim), device=idx_upper.device)
575
+ token_embeddings[..., :int(code_dim/2)][~learn_idx_upper] = self.vqvae.vqvae.quantizer_upper.dequantize(idx_upper[~learn_idx_upper]).requires_grad_(False)
576
+ token_embeddings[..., :int(code_dim/2)][learn_idx_upper] = self.learn_tok_emb(idx_upper[learn_idx_upper]-self.vqvae.vqvae.num_code)
577
+ token_embeddings[..., int(code_dim/2):][~learn_idx_lower] = self.vqvae.vqvae.quantizer_lower.dequantize(idx_lower[~learn_idx_lower]).requires_grad_(False)
578
+ token_embeddings[..., int(code_dim/2):][learn_idx_lower] = self.learn_tok_emb(idx_lower[learn_idx_lower]-self.vqvae.vqvae.num_code)
579
+ token_embeddings = self.to_emb(token_embeddings)
580
+
581
+ if self.num_local_layer > 0:
582
+ word_emb = self.word_emb(word_emb)
583
+ token_embeddings = self.pos_embed(token_embeddings)
584
+ for module in self.cross_att:
585
+ token_embeddings = module(token_embeddings, word_emb)
586
+ token_embeddings = torch.cat([self.cond_emb(clip_feature).unsqueeze(1), token_embeddings], dim=1)
587
+
588
+ x = self.pos_embed(token_embeddings)
589
+ for block in self.blocks:
590
+ x = block(x, src_mask)
591
+
592
+ return x
593
+
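+ # [INFO] In this upper/lower variant, the first half of code_dim holds the upper-body token
+ # embedding and the second half the lower-body embedding; each half is dequantized from its
+ # own quantizer (quantizer_upper / quantizer_lower), with learn_tok_emb shared for the
+ # special end / pad / mask tokens.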
594
+
595
+ class CrossCondTransHead(nn.Module):
596
+
597
+ def __init__(self,
598
+ num_vq=1024,
599
+ embed_dim=512,
600
+ block_size=16,
601
+ num_layers=2,
602
+ n_head=8,
603
+ drop_out_rate=0.1,
604
+ fc_rate=4):
605
+ super().__init__()
606
+
607
+ self.blocks = nn.Sequential(*[Block(embed_dim, block_size, n_head, drop_out_rate, fc_rate) for _ in range(num_layers)])
608
+ self.ln_f = nn.LayerNorm(embed_dim)
609
+ self.head = nn.Linear(embed_dim, num_vq, bias=False)
610
+ self.block_size = block_size
611
+
612
+ self.apply(self._init_weights)
613
+
614
+ def get_block_size(self):
615
+ return self.block_size
616
+
617
+ def _init_weights(self, module):
618
+ if isinstance(module, (nn.Linear, nn.Embedding)):
619
+ module.weight.data.normal_(mean=0.0, std=0.02)
620
+ if isinstance(module, nn.Linear) and module.bias is not None:
621
+ module.bias.data.zero_()
622
+ elif isinstance(module, nn.LayerNorm):
623
+ module.bias.data.zero_()
624
+ module.weight.data.fill_(1.0)
625
+
626
+ def forward(self, x, src_mask):
627
+ for block in self.blocks:
628
+ x = block(x, src_mask)
629
+ x = self.ln_f(x)
630
+ logits = self.head(x)
631
+ return logits
632
+
633
+
634
+
635
+
636
+
637
+
models/vqvae.py ADDED
@@ -0,0 +1,162 @@
1
+ import torch.nn as nn
2
+ from models.encdec import Encoder, Decoder
3
+ from models.quantize_cnn import QuantizeEMAReset, Quantizer, QuantizeEMA, QuantizeReset
4
+ from models.t2m_trans import Decoder_Transformer, Encoder_Transformer
5
+ from exit.utils import generate_src_mask
6
+
7
+ class VQVAE_251(nn.Module):
8
+ def __init__(self,
9
+ args,
10
+ nb_code=1024,
11
+ code_dim=512,
12
+ output_emb_width=512,
13
+ down_t=3,
14
+ stride_t=2,
15
+ width=512,
16
+ depth=3,
17
+ dilation_growth_rate=3,
18
+ activation='relu',
19
+ norm=None):
20
+
21
+ super().__init__()
22
+ self.code_dim = code_dim
23
+ self.num_code = nb_code
24
+ self.quant = args.quantizer
25
+ output_dim = 251 if args.dataname == 'kit' else 263
26
+ self.encoder = Encoder(output_dim, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
27
+
28
+ # Transformer Encoder
29
+ # self.encoder = Encoder_Transformer(
30
+ # input_feats=output_dim,
31
+ # embed_dim=512, # 1024
32
+ # output_dim=512,
33
+ # block_size=4,
34
+ # num_layers=6,
35
+ # n_head=16
36
+ # )
37
+
38
+ # Transformer Encoder 4 frames
39
+ # from exit.motiontransformer import MotionTransformerEncoder
40
+ # in_feature = 251 if args.dataname == 'kit' else 263
41
+ # self.encoder2 = MotionTransformerEncoder(in_feature, args.code_dim, num_frames=4, num_layers=2)
42
+
43
+ self.decoder = Decoder(output_dim, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
44
+ # self.decoder = Decoder_Transformer(
45
+ # code_dim=512,
46
+ # embed_dim=512, # 1024
47
+ # output_dim=output_dim,
48
+ # block_size=49,
49
+ # num_layers=6,
50
+ # n_head=8
51
+ # )
52
+ if args.quantizer == "ema_reset":
53
+ self.quantizer = QuantizeEMAReset(nb_code, code_dim, args)
54
+ elif args.quantizer == "orig":
55
+ self.quantizer = Quantizer(nb_code, code_dim, 1.0)
56
+ elif args.quantizer == "ema":
57
+ self.quantizer = QuantizeEMA(nb_code, code_dim, args)
58
+ elif args.quantizer == "reset":
59
+ self.quantizer = QuantizeReset(nb_code, code_dim, args)
60
+
61
+
62
+ def preprocess(self, x):
63
+ # (bs, T, Jx3) -> (bs, Jx3, T)
64
+ x = x.permute(0,2,1).float()
65
+ return x
66
+
67
+
68
+ def postprocess(self, x):
69
+ # (bs, Jx3, T) -> (bs, T, Jx3)
70
+ x = x.permute(0,2,1)
71
+ return x
72
+
73
+
74
+ def encode(self, x):
75
+ N, T, _ = x.shape
76
+ x_in = self.preprocess(x)
77
+ x_encoder = self.encoder(x_in)
78
+ x_encoder = self.postprocess(x_encoder)
79
+ x_encoder = x_encoder.contiguous().view(-1, x_encoder.shape[-1]) # (NT, C)
80
+ code_idx = self.quantizer.quantize(x_encoder)
81
+ code_idx = code_idx.view(N, -1)
82
+ return code_idx
83
+
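+ # [INFO] encode() above flattens the encoder output to (N*T', C), quantizes each frame to its
+ # nearest codebook entry, and returns the code indices reshaped to (N, T'), where T' is the
+ # temporally downsampled length.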
84
+
85
+ def forward(self, x):
86
+
87
+ x_in = self.preprocess(x)
88
+ # Encode
89
+ # _x_in = x_in.reshape( int(x_in.shape[0]*4), x_in.shape[1], 16)
90
+ # x_encoder = self.encoder(_x_in)
91
+ # x_encoder = x_encoder.reshape(x_in.shape[0], -1, int(x_in.shape[2]/4))
92
+
93
+ # [Transformer Encoder]
94
+ # _x_in = x_in.reshape( int(x_in.shape[0]*x_in.shape[2]/4), x_in.shape[1], 4)
95
+ # _x_in = _x_in.permute(0,2,1)
96
+ # x_encoder = self.encoder2(_x_in)
97
+ # x_encoder = x_encoder.permute(0,2,1)
98
+ # x_encoder = x_encoder.reshape(x_in.shape[0], -1, int(x_in.shape[2]/4))
99
+
100
+ x_encoder = self.encoder(x_in)
101
+
102
+ ## quantization
103
+ x_quantized, loss, perplexity = self.quantizer(x_encoder)
104
+
105
+ ## decoder
106
+ x_decoder = self.decoder(x_quantized)
107
+ x_out = self.postprocess(x_decoder)
108
+ return x_out, loss, perplexity
109
+
110
+
111
+ def forward_decoder(self, x):
112
+ # x = x.clone()
113
+ # pad_mask = x >= self.code_dim
114
+ # x[pad_mask] = 0
115
+
116
+ x_d = self.quantizer.dequantize(x)
117
+ x_d = x_d.permute(0, 2, 1).contiguous()
118
+
119
+ # pad_mask = pad_mask.unsqueeze(1)
120
+ # x_d = x_d * ~pad_mask
121
+
122
+ # decoder
123
+ x_decoder = self.decoder(x_d)
124
+ x_out = self.postprocess(x_decoder)
125
+ return x_out
126
+
127
+
128
+
129
+ class HumanVQVAE(nn.Module):
130
+ def __init__(self,
131
+ args,
132
+ nb_code=512,
133
+ code_dim=512,
134
+ output_emb_width=512,
135
+ down_t=3,
136
+ stride_t=2,
137
+ width=512,
138
+ depth=3,
139
+ dilation_growth_rate=3,
140
+ activation='relu',
141
+ norm=None):
142
+
143
+ super().__init__()
144
+
145
+ self.nb_joints = 21 if args.dataname == 'kit' else 22
146
+ self.vqvae = VQVAE_251(args, nb_code, code_dim, code_dim, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
147
+
148
+ def forward(self, x, type='full'):
149
+ '''type=[full, encode, decode]'''
150
+ if type=='full':
151
+ x_out, loss, perplexity = self.vqvae(x)
152
+ return x_out, loss, perplexity
153
+ elif type=='encode':
154
+ b, t, c = x.size()
155
+ quants = self.vqvae.encode(x) # (N, T)
156
+ return quants
157
+ elif type=='decode':
158
+ x_out = self.vqvae.forward_decoder(x)
159
+ return x_out
160
+ else:
161
+ raise ValueError(f'Unknown "{type}" type')
162
+
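+ # Illustrative usage of the type-based dispatch above (here `model` denotes a HumanVQVAE
+ # instance; variable names are assumptions):
+ #   x_out, loss, perplexity = model(motion)      # type='full': encode, quantize, decode
+ #   codes = model(motion, type='encode')          # codebook indices, one per downsampled frame
+ #   recon = model(codes, type='decode')           # decode indices back to motion features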
models/vqvae_sep.py ADDED
@@ -0,0 +1,257 @@
1
+ import torch.nn as nn
2
+ from models.encdec import Encoder, Decoder
3
+ from models.quantize_cnn import QuantizeEMAReset, Quantizer, QuantizeEMA, QuantizeReset
4
+ from models.t2m_trans import Decoder_Transformer, Encoder_Transformer
5
+ from exit.utils import generate_src_mask
6
+ import torch
7
+ from utils.humanml_utils import HML_UPPER_BODY_MASK, HML_LOWER_BODY_MASK, UPPER_JOINT_Y_MASK
8
+
9
+ class VQVAE_SEP(nn.Module):
10
+ def __init__(self,
11
+ args,
12
+ nb_code=512,
13
+ code_dim=512,
14
+ output_emb_width=512,
15
+ down_t=3,
16
+ stride_t=2,
17
+ width=512,
18
+ depth=3,
19
+ dilation_growth_rate=3,
20
+ activation='relu',
21
+ norm=None,
22
+ moment=None,
23
+ sep_decoder=False):
24
+ super().__init__()
25
+ if args.dataname == 'kit':
26
+ self.nb_joints = 21
27
+ output_dim = 251
28
+ upper_dim = 120
29
+ lower_dim = 131
30
+ else:
31
+ self.nb_joints = 22
32
+ output_dim = 263
33
+ upper_dim = 156
34
+ lower_dim = 107
35
+ self.code_dim = code_dim
36
+ if moment is not None:
37
+ self.moment = moment
38
+ self.register_buffer('mean_upper', torch.tensor([0.1216, 0.2488, 0.2967, 0.5027, 0.4053, 0.4100, 0.5703, 0.4030, 0.4078, 0.1994, 0.1992, 0.0661, 0.0639], dtype=torch.float32))
39
+ self.register_buffer('std_upper', torch.tensor([0.0164, 0.0412, 0.0523, 0.0864, 0.0695, 0.0703, 0.1108, 0.0853, 0.0847, 0.1289, 0.1291, 0.2463, 0.2484], dtype=torch.float32))
40
+ # self.quantizer = QuantizeEMAReset(nb_code, code_dim, args)
41
+
42
+ # self.encoder = Encoder(output_dim, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
43
+ self.sep_decoder = sep_decoder
44
+ if self.sep_decoder:
45
+ self.decoder_upper = Decoder(upper_dim, int(code_dim/2), down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
46
+ self.decoder_lower = Decoder(lower_dim, int(code_dim/2), down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
47
+ else:
48
+ self.decoder = Decoder(output_dim, code_dim, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
49
+
50
+
51
+ self.num_code = nb_code
52
+
53
+ self.encoder_upper = Encoder(upper_dim, int(code_dim/2), down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
54
+ self.encoder_lower = Encoder(lower_dim, int(code_dim/2), down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
55
+ self.quantizer_upper = QuantizeEMAReset(nb_code, int(code_dim/2), args)
56
+ self.quantizer_lower = QuantizeEMAReset(nb_code, int(code_dim/2), args)
57
+
58
+ def rand_emb_idx(self, x_quantized, quantizer, idx_noise):
59
+ # x_quantized = x_quantized.detach()
60
+ x_quantized = x_quantized.permute(0,2,1)
61
+ mask = torch.bernoulli(idx_noise * torch.ones((*x_quantized.shape[:2], 1),
62
+ device=x_quantized.device))
63
+ r_indices = torch.randint(int(self.num_code/2), x_quantized.shape[:2], device=x_quantized.device)
64
+ r_emb = quantizer.dequantize(r_indices)
65
+ x_quantized = mask * r_emb + (1-mask) * x_quantized
66
+ x_quantized = x_quantized.permute(0,2,1)
67
+ return x_quantized
68
+
69
+ def normalize(self, data):
70
+ return (data - self.moment['mean']) / self.moment['std']
71
+
72
+ def denormalize(self, data):
73
+ return data * self.moment['std'] + self.moment['mean']
74
+
75
+ def normalize_upper(self, data):
76
+ return (data - self.mean_upper) / self.std_upper
77
+
78
+ def denormalize_upper(self, data):
79
+ return data * self.std_upper + self.mean_upper
80
+
81
+ def shift_upper_down(self, data):
82
+ data = data.clone()
83
+ data = self.denormalize(data)
84
+ shift_y = data[..., 3:4].clone()
85
+ data[..., UPPER_JOINT_Y_MASK] -= shift_y
86
+ _data = data.clone()
87
+ data = self.normalize(data)
88
+ data[..., UPPER_JOINT_Y_MASK] = self.normalize_upper(_data[..., UPPER_JOINT_Y_MASK])
89
+ return data
90
+
91
+ def shift_upper_up(self, data):
92
+ _data = data.clone()
93
+ data = self.denormalize(data)
94
+ data[..., UPPER_JOINT_Y_MASK] = self.denormalize_upper(_data[..., UPPER_JOINT_Y_MASK])
95
+ shift_y = data[..., 3:4].clone()
96
+ data[..., UPPER_JOINT_Y_MASK] += shift_y
97
+ data = self.normalize(data)
98
+ return data
99
+
100
+ def forward(self, x, *args, type='full', **kwargs):
101
+ '''type=[full, encode, decode]'''
102
+ if type=='full':
103
+ x = x.float()
104
+ x = self.shift_upper_down(x)
105
+
106
+ upper_emb = x[..., HML_UPPER_BODY_MASK]
107
+ lower_emb = x[..., HML_LOWER_BODY_MASK]
108
+ upper_emb = self.preprocess(upper_emb)
109
+ upper_emb = self.encoder_upper(upper_emb)
110
+ upper_emb, loss_upper, perplexity = self.quantizer_upper(upper_emb)
111
+
112
+ lower_emb = self.preprocess(lower_emb)
113
+ lower_emb = self.encoder_lower(lower_emb)
114
+ lower_emb, loss_lower, perplexity = self.quantizer_lower(lower_emb)
115
+ loss = loss_upper + loss_lower
116
+
117
+ if 'idx_noise' in kwargs and kwargs['idx_noise'] > 0:
118
+ upper_emb = self.rand_emb_idx(upper_emb, self.quantizer_upper, kwargs['idx_noise'])
119
+ lower_emb = self.rand_emb_idx(lower_emb, self.quantizer_lower, kwargs['idx_noise'])
120
+
121
+
122
+ # x_in = self.preprocess(x)
123
+ # x_encoder = self.encoder(x_in)
124
+
125
+ # ## quantization
126
+ # x_quantized, loss, perplexity = self.quantizer(x_encoder)
127
+
128
+ ## decoder
129
+ if self.sep_decoder:
130
+ x_decoder_upper = self.decoder_upper(upper_emb)
131
+ x_decoder_upper = self.postprocess(x_decoder_upper)
132
+ x_decoder_lower = self.decoder_lower(lower_emb)
133
+ x_decoder_lower = self.postprocess(x_decoder_lower)
134
+ x_out = merge_upper_lower(x_decoder_upper, x_decoder_lower)
135
+ x_out = self.shift_upper_up(x_out)
136
+
137
+ else:
138
+ x_quantized = torch.cat([upper_emb, lower_emb], dim=1)
139
+ x_decoder = self.decoder(x_quantized)
140
+ x_out = self.postprocess(x_decoder)
141
+
142
+ return x_out, loss, perplexity
143
+ elif type=='encode':
144
+ N, T, _ = x.shape
145
+ x = self.shift_upper_down(x)
146
+
147
+ upper_emb = x[..., HML_UPPER_BODY_MASK]
148
+ upper_emb = self.preprocess(upper_emb)
149
+ upper_emb = self.encoder_upper(upper_emb)
150
+ upper_emb = self.postprocess(upper_emb)
151
+ upper_emb = upper_emb.reshape(-1, upper_emb.shape[-1])
152
+ upper_code_idx = self.quantizer_upper.quantize(upper_emb)
153
+ upper_code_idx = upper_code_idx.view(N, -1)
154
+
155
+ lower_emb = x[..., HML_LOWER_BODY_MASK]
156
+ lower_emb = self.preprocess(lower_emb)
157
+ lower_emb = self.encoder_lower(lower_emb)
158
+ lower_emb = self.postprocess(lower_emb)
159
+ lower_emb = lower_emb.reshape(-1, lower_emb.shape[-1])
160
+ lower_code_idx = self.quantizer_lower.quantize(lower_emb)
161
+ lower_code_idx = lower_code_idx.view(N, -1)
162
+
163
+ code_idx = torch.cat([upper_code_idx.unsqueeze(-1), lower_code_idx.unsqueeze(-1)], dim=-1)
164
+ return code_idx
165
+
166
+ elif type=='decode':
167
+ if self.sep_decoder:
168
+ x_d_upper = self.quantizer_upper.dequantize(x[..., 0])
169
+ x_d_upper = x_d_upper.permute(0, 2, 1).contiguous()
170
+ x_d_upper = self.decoder_upper(x_d_upper)
171
+ x_d_upper = self.postprocess(x_d_upper)
172
+
173
+ x_d_lower = self.quantizer_lower.dequantize(x[..., 1])
174
+ x_d_lower = x_d_lower.permute(0, 2, 1).contiguous()
175
+ x_d_lower = self.decoder_lower(x_d_lower)
176
+ x_d_lower = self.postprocess(x_d_lower)
177
+
178
+ x_out = merge_upper_lower(x_d_upper, x_d_lower)
179
+ x_out = self.shift_upper_up(x_out)
180
+ return x_out
181
+ else:
182
+ x_d_upper = self.quantizer_upper.dequantize(x[..., 0])
183
+ x_d_lower = self.quantizer_lower.dequantize(x[..., 1])
184
+ x_d = torch.cat([x_d_upper, x_d_lower], dim=-1)
185
+ x_d = x_d.permute(0, 2, 1).contiguous()
186
+ x_decoder = self.decoder(x_d)
187
+ x_out = self.postprocess(x_decoder)
188
+ return x_out
189
+
190
+ def preprocess(self, x):
191
+ # (bs, T, Jx3) -> (bs, Jx3, T)
192
+ x = x.permute(0,2,1).float()
193
+ return x
194
+
195
+ def postprocess(self, x):
196
+ # (bs, Jx3, T) -> (bs, T, Jx3)
197
+ x = x.permute(0,2,1)
198
+ return x
199
+
200
+
201
+ def merge_upper_lower(upper_emb, lower_emb):
202
+ motion = torch.empty(*upper_emb.shape[:2], 263).to(upper_emb.device)
203
+ motion[..., HML_UPPER_BODY_MASK] = upper_emb
204
+ motion[..., HML_LOWER_BODY_MASK] = lower_emb
205
+ return motion
206
+
207
+ def upper_lower_sep(motion, joints_num):
208
+ # root
209
+ _root = motion[..., :4] # root
210
+
211
+ # position
212
+ start_indx = 1 + 2 + 1
213
+ end_indx = start_indx + (joints_num - 1) * 3
214
+ positions = motion[..., start_indx:end_indx]
215
+ positions = positions.view(*motion.shape[:2], (joints_num - 1), 3)
216
+
217
+ # 6drot
218
+ start_indx = end_indx
219
+ end_indx = start_indx + (joints_num - 1) * 6
220
+ _6d_rot = motion[..., start_indx:end_indx]
221
+ _6d_rot = _6d_rot.view(*motion.shape[:2], (joints_num - 1), 6)
222
+
223
+ # joint_velo
224
+ start_indx = end_indx
225
+ end_indx = start_indx + joints_num * 3
226
+ joint_velo = motion[..., start_indx:end_indx]
227
+ joint_velo = joint_velo.view(*motion.shape[:2], joints_num, 3)
228
+
229
+ # foot_contact
230
+ foot_contact = motion[..., end_indx:]
231
+
232
+ ################################################################################################
233
+ #### Lower Body
234
+ if joints_num == 22:
235
+ lower_body = torch.tensor([0,1,2,4,5,7,8,10,11])
236
+ else:
237
+ lower_body = torch.tensor([0, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])
238
+ lower_body_exclude_root = lower_body[1:] - 1
239
+
240
+ LOW_positions = positions[:,:, lower_body_exclude_root].view(*motion.shape[:2], -1)
241
+ LOW_6d_rot = _6d_rot[:,:, lower_body_exclude_root].view(*motion.shape[:2], -1)
242
+ LOW_joint_velo = joint_velo[:,:, lower_body].view(*motion.shape[:2], -1)
243
+ lower_emb = torch.cat([_root, LOW_positions, LOW_6d_rot, LOW_joint_velo, foot_contact], dim=-1)
244
+
245
+ #### Upper Body
246
+ if joints_num == 22:
247
+ upper_body = torch.tensor([3,6,9,12,13,14,15,16,17,18,19,20,21])
248
+ else:
249
+ upper_body = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
250
+ upper_body_exclude_root = upper_body - 1
251
+
252
+ UP_positions = positions[:,:, upper_body_exclude_root].view(*motion.shape[:2], -1)
253
+ UP_6d_rot = _6d_rot[:,:, upper_body_exclude_root].view(*motion.shape[:2], -1)
254
+ UP_joint_velo = joint_velo[:,:, upper_body].view(*motion.shape[:2], -1)
255
+ upper_emb = torch.cat([UP_positions, UP_6d_rot, UP_joint_velo], dim=-1)
256
+
257
+ return upper_emb, lower_emb
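For orientation, a small self-contained sketch of the mask-based split/merge that VQVAE_SEP and merge_upper_lower rely on. It uses a toy 10-dimensional feature vector and a made-up upper-body mask; the real HML_UPPER_BODY_MASK / HML_LOWER_BODY_MASK from utils.humanml_utils are 263-dimensional boolean masks over the HumanML3D feature layout.

import torch

D = 10                                          # toy feature size (real HumanML3D vectors have 263 dims)
upper_mask = torch.zeros(D, dtype=torch.bool)
upper_mask[:6] = True                           # hypothetical "upper-body" channels
lower_mask = ~upper_mask

motion = torch.randn(2, 16, D)                  # (batch, frames, features)
upper = motion[..., upper_mask]                 # (2, 16, 6) -> would feed encoder_upper
lower = motion[..., lower_mask]                 # (2, 16, 4) -> would feed encoder_lower

# merge_upper_lower: scatter both halves back into a full feature vector
merged = torch.empty_like(motion)
merged[..., upper_mask] = upper
merged[..., lower_mask] = lower
assert torch.equal(merged, motion)

The same boolean-mask indexing is what lets the separate upper/lower encoders, quantizers and decoders operate on disjoint slices of the full motion feature vector.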
options/get_eval_option.py ADDED
@@ -0,0 +1,83 @@
1
+ from argparse import Namespace
2
+ import re
3
+ from os.path import join as pjoin
4
+
5
+
6
+ def is_float(numStr):
7
+ flag = False
8
+ numStr = str(numStr).strip().lstrip('-').lstrip('+')
9
+ try:
10
+ reg = re.compile(r'^[-+]?[0-9]+\.[0-9]+$')
11
+ res = reg.match(str(numStr))
12
+ if res:
13
+ flag = True
14
+ except Exception as ex:
15
+ print("is_float() - error: " + str(ex))
16
+ return flag
17
+
18
+
19
+ def is_number(numStr):
20
+ flag = False
21
+ numStr = str(numStr).strip().lstrip('-').lstrip('+')
22
+ if str(numStr).isdigit():
23
+ flag = True
24
+ return flag
25
+
26
+
27
+ def get_opt(opt_path, device):
28
+ opt = Namespace()
29
+ opt_dict = vars(opt)
30
+
31
+ skip = ('-------------- End ----------------',
32
+ '------------ Options -------------',
33
+ '\n')
34
+ print('Reading', opt_path)
35
+ with open(opt_path) as f:
36
+ for line in f:
37
+ if line.strip() not in skip:
38
+ # print(line.strip())
39
+ key, value = line.strip().split(': ')
40
+ if value in ('True', 'False'):
41
+ opt_dict[key] = (value == 'True')
42
+ # print(key, value)
43
+ elif is_float(value):
44
+ opt_dict[key] = float(value)
45
+ elif is_number(value):
46
+ opt_dict[key] = int(value)
47
+ else:
48
+ opt_dict[key] = str(value)
49
+
50
+ # print(opt)
51
+ opt_dict['which_epoch'] = 'finest'
52
+ opt.save_root = pjoin(opt.checkpoints_dir, opt.dataset_name, opt.name)
53
+ opt.model_dir = pjoin(opt.save_root, 'model')
54
+ opt.meta_dir = pjoin(opt.save_root, 'meta')
55
+
56
+ if opt.dataset_name == 't2m':
57
+ opt.data_root = './dataset/HumanML3D/'
58
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
59
+ opt.text_dir = pjoin(opt.data_root, 'texts')
60
+ opt.joints_num = 22
61
+ opt.dim_pose = 263
62
+ opt.max_motion_length = 196
63
+ opt.max_motion_frame = 196
64
+ opt.max_motion_token = 55
65
+ elif opt.dataset_name == 'kit':
66
+ opt.data_root = './dataset/KIT-ML/'
67
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
68
+ opt.text_dir = pjoin(opt.data_root, 'texts')
69
+ opt.joints_num = 21
70
+ opt.dim_pose = 251
71
+ opt.max_motion_length = 196
72
+ opt.max_motion_frame = 196
73
+ opt.max_motion_token = 55
74
+ else:
75
+ raise KeyError('Dataset not recognized')
76
+
77
+ opt.dim_word = 300
78
+ opt.num_classes = 200 // opt.unit_length
79
+ opt.is_train = False
80
+ opt.is_continue = False
81
+ opt.device = device
82
+
83
+ return opt
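A minimal sketch of the opt.txt parsing convention implemented above, run on in-memory lines instead of a checkpoint file; the keys and values below are made up for illustration, and the typing rules mirror is_float / is_number.

import re
from argparse import Namespace

lines = ["dataset_name: t2m", "unit_length: 4", "lambda_kld: 0.005", "is_train: True"]
opt = Namespace()
for line in lines:
    key, value = line.strip().split(': ')
    if value in ('True', 'False'):
        setattr(opt, key, value == 'True')              # booleans
    elif re.fullmatch(r'[-+]?[0-9]+\.[0-9]+', value):
        setattr(opt, key, float(value))                 # floats, as in is_float()
    elif value.lstrip('+-').isdigit():
        setattr(opt, key, int(value))                   # ints, as in is_number()
    else:
        setattr(opt, key, value)                        # everything else stays a string
print(opt)  # Namespace(dataset_name='t2m', is_train=True, lambda_kld=0.005, unit_length=4)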
options/option_transformer.py ADDED
@@ -0,0 +1,72 @@
1
+ import argparse
2
+
3
+ def get_args_parser():
4
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
5
+ add_help=True,
6
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
7
+
8
+ ## dataloader
9
+
10
+ parser.add_argument('--dataname', type=str, default='t2m', help='dataset name (t2m or kit)')
11
+ parser.add_argument('--batch-size', default=128, type=int, help='batch size')
12
+ parser.add_argument('--fps', default=[20], nargs="+", type=int, help='frames per second')
13
+ parser.add_argument('--seq-len', type=int, default=64, help='training motion length')
14
+
15
+ ## optimization
16
+ parser.add_argument('--total-iter', default=300000, type=int, help='number of total iterations to run')
17
+ parser.add_argument('--warm-up-iter', default=1000, type=int, help='number of total iterations for warmup')
18
+ parser.add_argument('--lr', default=2e-4, type=float, help='max learning rate')
19
+ parser.add_argument('--lr-scheduler', default=[150000], nargs="+", type=int, help="learning rate schedule (iterations)")
20
+ parser.add_argument('--gamma', default=0.05, type=float, help="learning rate decay")
21
+
22
+ parser.add_argument('--weight-decay', default=1e-6, type=float, help='weight decay')
23
+ parser.add_argument('--decay-option',default='all', type=str, choices=['all', 'noVQ'], help='disable weight decay on codebook')
24
+ parser.add_argument('--optimizer',default='adamw', type=str, choices=['adam', 'adamw'], help='optimizer type')
25
+
26
+ ## vqvae arch
27
+ parser.add_argument("--code-dim", type=int, default=32, help="embedding dimension")
28
+ parser.add_argument("--nb-code", type=int, default=8192, help="nb of embedding")
29
+ parser.add_argument("--mu", type=float, default=0.99, help="exponential moving average to update the codebook")
30
+ parser.add_argument("--down-t", type=int, default=2, help="downsampling rate")
31
+ parser.add_argument("--stride-t", type=int, default=2, help="stride size")
32
+ parser.add_argument("--width", type=int, default=512, help="width of the network")
33
+ parser.add_argument("--depth", type=int, default=3, help="depth of the network")
34
+ parser.add_argument("--dilation-growth-rate", type=int, default=3, help="dilation growth rate")
35
+ parser.add_argument("--output-emb-width", type=int, default=512, help="output embedding width")
36
+ parser.add_argument('--vq-act', type=str, default='relu', choices = ['relu', 'silu', 'gelu'], help='activation function in the VQ-VAE')
37
+
38
+ ## gpt arch
39
+ parser.add_argument("--block-size", type=int, default=51, help="seq len")
40
+ parser.add_argument("--embed-dim-gpt", type=int, default=1024, help="embedding dimension")
41
+ parser.add_argument("--clip-dim", type=int, default=512, help="latent dimension in the clip feature")
42
+ parser.add_argument("--num-layers", type=int, default=9, help="nb of transformer layers")
43
+ parser.add_argument("--num-local-layer", type=int, default=2, help="nb of transformer local layers")
44
+ parser.add_argument("--n-head-gpt", type=int, default=16, help="nb of heads")
45
+ parser.add_argument("--ff-rate", type=int, default=4, help="feedforward size")
46
+ parser.add_argument("--drop-out-rate", type=float, default=0.1, help="dropout ratio in the pos encoding")
47
+
48
+ ## quantizer
49
+ parser.add_argument("--quantizer", type=str, default='ema_reset', choices = ['ema', 'orig', 'ema_reset', 'reset'], help="quantizer variant")
50
+ parser.add_argument('--quantbeta', type=float, default=1.0, help='commitment loss weight (beta) for the standard VQ quantizer')
51
+
52
+ ## resume
53
+ parser.add_argument("--resume-pth", type=str, default=None, help='resume vq pth')
54
+ parser.add_argument("--resume-trans", type=str, default=None, help='resume gpt pth')
55
+
56
+
57
+ ## output directory
58
+ parser.add_argument('--out-dir', type=str, default='output', help='output directory')
59
+ parser.add_argument('--exp-name', type=str, default='exp_debug', help='name of the experiment, will create a file inside out-dir')
60
+ parser.add_argument('--vq-name', type=str, default='VQVAE', help='name of the generated dataset .npy, will create a file inside out-dir')
61
+ ## other
62
+ parser.add_argument('--print-iter', default=200, type=int, help='print frequency')
63
+ parser.add_argument('--eval-iter', default=10000, type=int, help='evaluation frequency')
64
+ parser.add_argument('--seed', default=123, type=int, help='seed for initializing training. ')
65
+ parser.add_argument("--if-maxtest", action='store_true', help="test in max")
66
+ parser.add_argument('--pkeep', type=float, default=.5, help='keep rate for gpt training')
67
+
68
+ ## generator
69
+ parser.add_argument('--text', type=str, help='text')
70
+ parser.add_argument('--length', type=int, help='length')
71
+
72
+ return parser.parse_args()
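Assuming the repository root is on PYTHONPATH, the parser above can be exercised programmatically as follows; the flag values are arbitrary examples, only the flag names come from the definitions above.

import sys
import options.option_transformer as option_trans

sys.argv = ['train_t2m_trans.py',
            '--exp-name', 'exp_debug',
            '--vq-name', 'VQVAE',
            '--nb-code', '8192',
            '--pkeep', '0.5']
args = option_trans.get_args_parser()    # calls parser.parse_args() internally
print(args.nb_code, args.pkeep)          # 8192 0.5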
options/option_vq.py ADDED
@@ -0,0 +1,62 @@
1
+ import argparse
2
+
3
+ def get_args_parser():
4
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for AIST',
5
+ add_help=True,
6
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
7
+
8
+ ## dataloader
9
+ parser.add_argument('--dataname', type=str, default='kit', help='dataset name (t2m or kit)')
10
+ parser.add_argument('--batch-size', default=256, type=int, help='batch size')
11
+ parser.add_argument('--window-size', type=int, default=64, help='training motion length')
12
+
13
+ ## optimization
14
+ parser.add_argument('--total-iter', default=300000, type=int, help='number of total iterations to run')
15
+ parser.add_argument('--warm-up-iter', default=1000, type=int, help='number of total iterations for warmup')
16
+ parser.add_argument('--lr', default=2e-4, type=float, help='max learning rate')
17
+ parser.add_argument('--lr-scheduler', default=[200000], nargs="+", type=int, help="learning rate schedule (iterations)")
18
+ parser.add_argument('--gamma', default=0.05, type=float, help="learning rate decay")
19
+
20
+ parser.add_argument('--weight-decay', default=0.0, type=float, help='weight decay')
21
+ parser.add_argument("--commit", type=float, default=0.02, help="hyper-parameter for the commitment loss")
22
+ parser.add_argument('--loss-vel', type=float, default=0.5, help='hyper-parameter for the velocity loss')
23
+ parser.add_argument('--recons-loss', type=str, default='l1_smooth', help='reconstruction loss')
24
+
25
+ ## vqvae arch
26
+ parser.add_argument("--code-dim", type=int, default=32, help="embedding dimension")
27
+ parser.add_argument("--nb-code", type=int, default=8192, help="nb of embedding")
28
+ parser.add_argument("--mu", type=float, default=0.99, help="exponential moving average to update the codebook")
29
+ parser.add_argument("--down-t", type=int, default=2, help="downsampling rate")
30
+ parser.add_argument("--stride-t", type=int, default=2, help="stride size")
31
+ parser.add_argument("--width", type=int, default=512, help="width of the network")
32
+ parser.add_argument("--depth", type=int, default=3, help="depth of the network")
33
+ parser.add_argument("--dilation-growth-rate", type=int, default=3, help="dilation growth rate")
34
+ parser.add_argument("--output-emb-width", type=int, default=512, help="output embedding width")
35
+ parser.add_argument('--vq-act', type=str, default='relu', choices = ['relu', 'silu', 'gelu'], help='activation function in the VQ-VAE')
36
+ parser.add_argument('--vq-norm', type=str, default=None, help='normalization layer in the VQ-VAE (None to disable)')
37
+
38
+ ## quantizer
39
+ parser.add_argument("--quantizer", type=str, default='ema_reset', choices = ['ema', 'orig', 'ema_reset', 'reset'], help="quantizer variant")
40
+ parser.add_argument('--beta', type=float, default=1.0, help='commitment loss in standard VQ')
41
+
42
+ ## resume
43
+ parser.add_argument("--resume-pth", type=str, default=None, help='resume pth for VQ')
44
+ parser.add_argument("--resume-gpt", type=str, default=None, help='resume pth for GPT')
45
+
46
+
47
+ ## output directory
48
+ parser.add_argument('--out-dir', type=str, default='output', help='output directory')
49
+ parser.add_argument('--results-dir', type=str, default='visual_results/', help='output directory')
50
+ parser.add_argument('--visual-name', type=str, default='baseline', help='name prefix for saved visualizations')
51
+ parser.add_argument('--exp-name', type=str, default='exp_debug', help='name of the experiment, will create a file inside out-dir')
52
+ ## other
53
+ parser.add_argument('--print-iter', default=200, type=int, help='print frequency')
54
+ parser.add_argument('--eval-iter', default=5000, type=int, help='evaluation frequency')
55
+ parser.add_argument('--seed', default=123, type=int, help='seed for initializing training.')
56
+
57
+ parser.add_argument('--vis-gt', action='store_true', help='whether visualize GT motions')
58
+ parser.add_argument('--nb-vis', default=20, type=int, help='nb of visualizations')
59
+
60
+ parser.add_argument('--sep-uplow', action='store_true', help='use separate upper-/lower-body encoders and codebooks (VQVAE_SEP)')
61
+
62
+ return parser.parse_args()
train_t2m_trans.py ADDED
@@ -0,0 +1,265 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+
5
+ from torch.utils.tensorboard import SummaryWriter
6
+ from os.path import join as pjoin
7
+ from torch.distributions import Categorical
8
+ import json
9
+ import clip
10
+
11
+ import options.option_transformer as option_trans
12
+ import models.vqvae as vqvae
13
+ import utils.utils_model as utils_model
14
+ import utils.eval_trans as eval_trans
15
+ from dataset import dataset_TM_train
16
+ from dataset import dataset_TM_eval
17
+ from dataset import dataset_tokenize
18
+ import models.t2m_trans as trans
19
+ from options.get_eval_option import get_opt
20
+ from models.evaluator_wrapper import EvaluatorModelWrapper
21
+ import warnings
22
+ warnings.filterwarnings('ignore')
23
+ from exit.utils import get_model, visualize_2motions
24
+ from tqdm import tqdm
25
+ from exit.utils import get_model, visualize_2motions, generate_src_mask, init_save_folder, uniform, cosine_schedule
26
+ from einops import rearrange, repeat
27
+ import torch.nn.functional as F
28
+ from exit.utils import base_dir
29
+
30
+ ##### ---- Exp dirs ---- #####
31
+ args = option_trans.get_args_parser()
32
+ torch.manual_seed(args.seed)
33
+
34
+ # args.out_dir = os.path.join(args.out_dir, f'{args.exp_name}')
35
+ init_save_folder(args)
36
+
37
+ # [TODO] make the 'output/' folder an argument
38
+ args.vq_dir = f'./output/vq/{args.vq_name}' #os.path.join("./dataset/KIT-ML" if args.dataname == 'kit' else "./dataset/HumanML3D", f'{args.vq_name}')
39
+ codebook_dir = f'{args.vq_dir}/codebook/'
40
+ args.resume_pth = f'{args.vq_dir}/net_last.pth'
41
+ os.makedirs(args.vq_dir, exist_ok = True)
42
+ os.makedirs(codebook_dir, exist_ok = True)
43
+ os.makedirs(args.out_dir, exist_ok = True)
44
+ os.makedirs(args.out_dir+'/html', exist_ok=True)
45
+
46
+ ##### ---- Logger ---- #####
47
+ logger = utils_model.get_logger(args.out_dir)
48
+ writer = SummaryWriter(args.out_dir)
49
+ logger.info(json.dumps(vars(args), indent=4, sort_keys=True))
50
+
51
+
52
+ from utils.word_vectorizer import WordVectorizer
53
+ w_vectorizer = WordVectorizer('./glove', 'our_vab')
54
+ val_loader = dataset_TM_eval.DATALoader(args.dataname, False, 32, w_vectorizer)
55
+
56
+ dataset_opt_path = 'checkpoints/kit/Comp_v6_KLD005/opt.txt' if args.dataname == 'kit' else 'checkpoints/t2m/Comp_v6_KLD005/opt.txt'
57
+
58
+ wrapper_opt = get_opt(dataset_opt_path, torch.device('cuda'))
59
+ eval_wrapper = EvaluatorModelWrapper(wrapper_opt)
60
+
61
+ ##### ---- Network ---- #####
62
+ clip_model, clip_preprocess = clip.load("ViT-B/32", device=torch.device('cuda'), jit=False) # Must set jit=False for training
63
+ clip.model.convert_weights(clip_model) # Actually this line is unnecessary since CLIP is already in float16 by default
64
+ clip_model.eval()
65
+ for p in clip_model.parameters():
66
+ p.requires_grad = False
67
+
68
+ # https://github.com/openai/CLIP/issues/111
69
+ class TextCLIP(torch.nn.Module):
70
+ def __init__(self, model) :
71
+ super(TextCLIP, self).__init__()
72
+ self.model = model
73
+
74
+ def forward(self,text):
75
+ with torch.no_grad():
76
+ word_emb = self.model.token_embedding(text).type(self.model.dtype)
77
+ word_emb = word_emb + self.model.positional_embedding.type(self.model.dtype)
78
+ word_emb = word_emb.permute(1, 0, 2) # NLD -> LND
79
+ word_emb = self.model.transformer(word_emb)
80
+ word_emb = self.model.ln_final(word_emb).permute(1, 0, 2).float()
81
+ enctxt = self.model.encode_text(text).float()
82
+ return enctxt, word_emb
83
+ clip_model = TextCLIP(clip_model)
84
+
85
+ net = vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers
86
+ args.nb_code,
87
+ args.code_dim,
88
+ args.output_emb_width,
89
+ args.down_t,
90
+ args.stride_t,
91
+ args.width,
92
+ args.depth,
93
+ args.dilation_growth_rate)
94
+
95
+
96
+ trans_encoder = trans.Text2Motion_Transformer(vqvae=net,
97
+ num_vq=args.nb_code,
98
+ embed_dim=args.embed_dim_gpt,
99
+ clip_dim=args.clip_dim,
100
+ block_size=args.block_size,
101
+ num_layers=args.num_layers,
102
+ num_local_layer=args.num_local_layer,
103
+ n_head=args.n_head_gpt,
104
+ drop_out_rate=args.drop_out_rate,
105
+ fc_rate=args.ff_rate)
106
+
107
+ print ('loading checkpoint from {}'.format(args.resume_pth))
108
+ ckpt = torch.load(args.resume_pth, map_location='cpu')
109
+ net.load_state_dict(ckpt['net'], strict=True)
110
+ net.eval()
111
+ net.cuda()
112
+
113
+ if args.resume_trans is not None:
114
+ print ('loading transformer checkpoint from {}'.format(args.resume_trans))
115
+ ckpt = torch.load(args.resume_trans, map_location='cpu')
116
+ trans_encoder.load_state_dict(ckpt['trans'], strict=True)
117
+ trans_encoder.train()
118
+ trans_encoder.cuda()
119
+ trans_encoder = torch.nn.DataParallel(trans_encoder)
120
+
121
+ ##### ---- Optimizer & Scheduler ---- #####
122
+ optimizer = utils_model.initial_optim(args.decay_option, args.lr, args.weight_decay, trans_encoder, args.optimizer)
123
+ scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_scheduler, gamma=args.gamma)
124
+
125
+ ##### ---- Optimization goals ---- #####
126
+ loss_ce = torch.nn.CrossEntropyLoss(reduction='none')
127
+
128
+ ##### ---- get code ---- #####
129
+ ##### ---- Dataloader ---- #####
130
+ if len(os.listdir(codebook_dir)) == 0:
131
+ train_loader_token = dataset_tokenize.DATALoader(args.dataname, 1, unit_length=2**args.down_t)
132
+ for batch in train_loader_token:
133
+ pose, name = batch
134
+ bs, seq = pose.shape[0], pose.shape[1]
135
+
136
+ pose = pose.cuda().float() # bs, nb_joints, joints_dim, seq_len
137
+ target = net(pose, type='encode')
138
+ target = target.cpu().numpy()
139
+ np.save(pjoin(codebook_dir, name[0] +'.npy'), target)
140
+
141
+
142
+ train_loader = dataset_TM_train.DATALoader(args.dataname, args.batch_size, args.nb_code, codebook_dir, unit_length=2**args.down_t)
143
+ train_loader_iter = dataset_TM_train.cycle(train_loader)
144
+
145
+
146
+ ##### ---- Training ---- #####
147
+ best_fid=1000
148
+ best_iter=0
149
+ best_div=100
150
+ best_top1=0
151
+ best_top2=0
152
+ best_top3=0
153
+ best_matching=100
154
+ # pred_pose_eval, pose, m_length, clip_text, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, best_multi, writer, logger = eval_trans.evaluation_transformer(args.out_dir, val_loader, net, trans_encoder, logger, writer, 0, best_fid=1000, best_iter=0, best_div=100, best_top1=0, best_top2=0, best_top3=0, best_matching=100, clip_model=clip_model, eval_wrapper=eval_wrapper)
155
+
156
+ def get_acc(cls_pred, target, mask):
157
+ cls_pred = torch.masked_select(cls_pred, mask.unsqueeze(-1)).view(-1, cls_pred.shape[-1])
158
+ target_all = torch.masked_select(target, mask)
159
+ probs = torch.softmax(cls_pred, dim=-1)
160
+ _, cls_pred_index = torch.max(probs, dim=-1)
161
+ right_num = (cls_pred_index == target_all).sum()
162
+ return right_num*100/mask.sum()
163
+
164
+ # while nb_iter <= args.total_iter:
165
+ for nb_iter in tqdm(range(1, args.total_iter + 1), position=0, leave=True):
166
+ batch = next(train_loader_iter)
167
+ clip_text, m_tokens, m_tokens_len = batch
168
+ m_tokens, m_tokens_len = m_tokens.cuda(), m_tokens_len.cuda()
169
+ bs = m_tokens.shape[0]
170
+ target = m_tokens # (bs, 26)
171
+ target = target.cuda()
172
+ batch_size, max_len = target.shape[:2]
173
+
174
+ # Random Drop Text
175
+ # text_mask = np.random.random(len(clip_text)) > .05
176
+ # clip_text = np.array(clip_text)
177
+ # clip_text[~text_mask] = ''
178
+
179
+ text = clip.tokenize(clip_text, truncate=True).cuda()
180
+
181
+ feat_clip_text, word_emb = clip_model(text)
182
+
183
+ # [INFO] Swap input tokens
184
+ if args.pkeep == -1:
185
+ proba = np.random.rand(1)[0]
186
+ mask = torch.bernoulli(proba * torch.ones(target.shape,
187
+ device=target.device))
188
+ else:
189
+ mask = torch.bernoulli(args.pkeep * torch.ones(target.shape,
190
+ device=target.device))
191
+ # randomize only motion tokens (not pad tokens), so pad tokens never get mixed in
192
+ seq_mask_no_end = generate_src_mask(max_len, m_tokens_len)
193
+ mask = torch.logical_or(mask, ~seq_mask_no_end).int()
194
+ r_indices = torch.randint_like(target, args.nb_code)
195
+ input_indices = mask*target+(1-mask)*r_indices
196
+
197
+ # Time step masking
198
+ mask_id = get_model(net).vqvae.num_code + 2
199
+ # rand_time = uniform((batch_size,), device = target.device)
200
+ # rand_mask_probs = cosine_schedule(rand_time)
201
+ rand_mask_probs = torch.zeros(batch_size, device = m_tokens_len.device).float().uniform_(0.5, 1)
202
+ # rand_mask_probs = cosine_schedule(rand_mask_probs)
203
+ num_token_masked = (m_tokens_len * rand_mask_probs).round().clamp(min = 1)
204
+ seq_mask = generate_src_mask(max_len, m_tokens_len+1)
205
+ batch_randperm = torch.rand((batch_size, max_len), device = target.device) - seq_mask_no_end.int()
206
+ batch_randperm = batch_randperm.argsort(dim = -1)
207
+ mask_token = batch_randperm < rearrange(num_token_masked, 'b -> b 1')
208
+
209
+ # masked_target = torch.where(mask_token, input=input_indices, other=-1)
210
+ masked_input_indices = torch.where(mask_token, mask_id, input_indices)
211
+
212
+ att_txt = None # CFG: torch.rand((seq_mask.shape[0], 1)) > 0.1
213
+ cls_pred = trans_encoder(masked_input_indices, feat_clip_text, src_mask = seq_mask, att_txt=att_txt, word_emb=word_emb)[:, 1:]
214
+
215
+ # [INFO] Compute xent loss as a batch
216
+ weights = seq_mask_no_end / (seq_mask_no_end.sum(-1).unsqueeze(-1) * seq_mask_no_end.shape[0])
217
+ cls_pred_seq_masked = cls_pred[seq_mask_no_end, :].view(-1, cls_pred.shape[-1])
218
+ target_seq_masked = target[seq_mask_no_end]
219
+ weight_seq_masked = weights[seq_mask_no_end]
220
+ loss_cls = F.cross_entropy(cls_pred_seq_masked, target_seq_masked, reduction = 'none')
221
+ loss_cls = (loss_cls * weight_seq_masked).sum()
222
+
223
+ ## global loss
224
+ optimizer.zero_grad()
225
+ loss_cls.backward()
226
+ optimizer.step()
227
+ scheduler.step()
228
+
229
+ if nb_iter % args.print_iter == 0 :
230
+ probs_seq_masked = torch.softmax(cls_pred_seq_masked, dim=-1)
231
+ _, cls_pred_seq_masked_index = torch.max(probs_seq_masked, dim=-1)
232
+ target_seq_masked = torch.masked_select(target, seq_mask_no_end)
233
+ right_seq_masked = (cls_pred_seq_masked_index == target_seq_masked).sum()
234
+
235
+ writer.add_scalar('./Loss/all', loss_cls, nb_iter)
236
+ writer.add_scalar('./ACC/every_token', right_seq_masked*100/seq_mask_no_end.sum(), nb_iter)
237
+
238
+ # [INFO] log mask/nomask separately
239
+ no_mask_token = ~mask_token * seq_mask_no_end
240
+ writer.add_scalar('./ACC/masked', get_acc(cls_pred, target, mask_token), nb_iter)
241
+ writer.add_scalar('./ACC/no_masked', get_acc(cls_pred, target, no_mask_token), nb_iter)
242
+
243
+ # msg = f"Train. Iter {nb_iter} : Loss. {avg_loss_cls:.5f}, ACC. {avg_acc:.4f}"
244
+ # logger.info(msg)
245
+
246
+ if nb_iter==0 or nb_iter % args.eval_iter == 0 or nb_iter == args.total_iter:
247
+ num_repeat = 1
248
+ rand_pos = False
249
+ if nb_iter == args.total_iter:
250
+ num_repeat = -30
251
+ rand_pos = True
252
+ val_loader = dataset_TM_eval.DATALoader(args.dataname, True, 32, w_vectorizer)
253
+ pred_pose_eval, pose, m_length, clip_text, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, best_multi, writer, logger = eval_trans.evaluation_transformer(args.out_dir, val_loader, net, trans_encoder, logger, writer, nb_iter, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, clip_model=clip_model, eval_wrapper=eval_wrapper, dataname=args.dataname, num_repeat=num_repeat, rand_pos=rand_pos)
254
+ # for i in range(4):
255
+ # x = pose[i].detach().cpu().numpy()
256
+ # y = pred_pose_eval[i].detach().cpu().numpy()
257
+ # l = m_length[i]
258
+ # caption = clip_text[i]
259
+ # cleaned_name = '-'.join(caption[:200].split('/'))
260
+ # visualize_2motions(x, val_loader.dataset.std, val_loader.dataset.mean, args.dataname, l, y, save_path=f'{args.out_dir}/html/{str(nb_iter)}_{cleaned_name}_{l}.html')
261
+
262
+ if nb_iter == args.total_iter:
263
+ msg_final = f"Train. Iter {best_iter} : FID. {best_fid:.5f}, Diversity. {best_div:.4f}, TOP1. {best_top1:.4f}, TOP2. {best_top2:.4f}, TOP3. {best_top3:.4f}"
264
+ logger.info(msg_final)
265
+ break
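The time-step masking in the loop above relies on valid tokens being left-aligned before the pad tokens. Below is a standalone, slightly more explicit restatement of that step (ranks via a double argsort instead of the single-argsort trick); the vocabulary size, lengths and mask id are made-up values rather than this repo's configuration.

import torch

batch_size, max_len, nb_code = 3, 8, 512
mask_id = nb_code + 2                                   # stand-in for the [MASK] token id
tokens = torch.randint(0, nb_code, (batch_size, max_len))
lengths = torch.tensor([8, 5, 3])                       # valid (non-pad) tokens per sample

valid = torch.arange(max_len)[None, :] < lengths[:, None]   # same role as generate_src_mask
ratio = torch.empty(batch_size).uniform_(0.5, 1.0)          # fraction of tokens to mask
num_masked = (lengths * ratio).round().clamp(min=1)

# Random score per position, pads pushed to the back, then mask the
# num_masked lowest-ranked (i.e. uniformly random, valid) positions.
scores = torch.rand(batch_size, max_len) + (~valid).float()
ranks = scores.argsort(dim=-1).argsort(dim=-1)
mask_token = ranks < num_masked[:, None]

masked_tokens = torch.where(mask_token, mask_id, tokens)
assert (mask_token.sum(-1) == num_masked).all()
assert not (mask_token & ~valid).any()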
train_vq.py ADDED
@@ -0,0 +1,194 @@
1
+ import os
2
+ import json
3
+
4
+ import torch
5
+ import torch.optim as optim
6
+ from torch.utils.tensorboard import SummaryWriter
7
+
8
+ import models.vqvae as vqvae
9
+ import utils.losses as losses
10
+ import options.option_vq as option_vq
11
+ import utils.utils_model as utils_model
12
+ from dataset import dataset_VQ, dataset_TM_eval
13
+ import utils.eval_trans as eval_trans
14
+ from options.get_eval_option import get_opt
15
+ from models.evaluator_wrapper import EvaluatorModelWrapper
16
+ import warnings
17
+ warnings.filterwarnings('ignore')
18
+ from utils.word_vectorizer import WordVectorizer
19
+ from tqdm import tqdm
20
+ from exit.utils import get_model, generate_src_mask, init_save_folder
21
+ from models.vqvae_sep import VQVAE_SEP
22
+
23
+ def update_lr_warm_up(optimizer, nb_iter, warm_up_iter, lr):
24
+
25
+ current_lr = lr * (nb_iter + 1) / (warm_up_iter + 1)
26
+ for param_group in optimizer.param_groups:
27
+ param_group["lr"] = current_lr
28
+
29
+ return optimizer, current_lr
30
+
31
+ ##### ---- Exp dirs ---- #####
32
+ args = option_vq.get_args_parser()
33
+ torch.manual_seed(args.seed)
34
+
35
+ args.out_dir = os.path.join(args.out_dir, f'vq') # /{args.exp_name}
36
+ # os.makedirs(args.out_dir, exist_ok = True)
37
+ init_save_folder(args)
38
+
39
+ ##### ---- Logger ---- #####
40
+ logger = utils_model.get_logger(args.out_dir)
41
+ writer = SummaryWriter(args.out_dir)
42
+ logger.info(json.dumps(vars(args), indent=4, sort_keys=True))
43
+
44
+
45
+
46
+ w_vectorizer = WordVectorizer('./glove', 'our_vab')
47
+
48
+ if args.dataname == 'kit' :
49
+ dataset_opt_path = 'checkpoints/kit/Comp_v6_KLD005/opt.txt'
50
+ args.nb_joints = 21
51
+
52
+ else :
53
+ dataset_opt_path = 'checkpoints/t2m/Comp_v6_KLD005/opt.txt'
54
+ args.nb_joints = 22
55
+
56
+ logger.info(f'Training on {args.dataname}, motions are with {args.nb_joints} joints')
57
+
58
+ wrapper_opt = get_opt(dataset_opt_path, torch.device('cuda'))
59
+ eval_wrapper = EvaluatorModelWrapper(wrapper_opt)
60
+
61
+
62
+ ##### ---- Dataloader ---- #####
63
+ train_loader = dataset_VQ.DATALoader(args.dataname,
64
+ args.batch_size,
65
+ window_size=args.window_size,
66
+ unit_length=2**args.down_t)
67
+
68
+ train_loader_iter = dataset_VQ.cycle(train_loader)
69
+
70
+ val_loader = dataset_TM_eval.DATALoader(args.dataname, False,
71
+ 32,
72
+ w_vectorizer,
73
+ unit_length=2**args.down_t)
74
+
75
+ ##### ---- Network ---- #####
76
+ if args.sep_uplow:
77
+ net = VQVAE_SEP(args, ## use args to define different parameters in different quantizers
78
+ args.nb_code,
79
+ args.code_dim,
80
+ args.output_emb_width,
81
+ args.down_t,
82
+ args.stride_t,
83
+ args.width,
84
+ args.depth,
85
+ args.dilation_growth_rate,
86
+ args.vq_act,
87
+ args.vq_norm,
88
+ {'mean': torch.from_numpy(train_loader.dataset.mean).cuda().float(),
89
+ 'std': torch.from_numpy(train_loader.dataset.std).cuda().float()},
90
+ True)
91
+ else:
92
+ net = vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers
93
+ args.nb_code,
94
+ args.code_dim,
95
+ args.output_emb_width,
96
+ args.down_t,
97
+ args.stride_t,
98
+ args.width,
99
+ args.depth,
100
+ args.dilation_growth_rate,
101
+ args.vq_act,
102
+ args.vq_norm)
103
+
104
+
105
+ if args.resume_pth :
106
+ logger.info('loading checkpoint from {}'.format(args.resume_pth))
107
+ ckpt = torch.load(args.resume_pth, map_location='cpu')
108
+ net.load_state_dict(ckpt['net'], strict=True)
109
+ net.train()
110
+ net.cuda()
111
+
112
+ ##### ---- Optimizer & Scheduler ---- #####
113
+ optimizer = optim.AdamW(net.parameters(), lr=args.lr, betas=(0.9, 0.99), weight_decay=args.weight_decay)
114
+ scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_scheduler, gamma=args.gamma)
115
+
116
+
117
+ Loss = losses.ReConsLoss(args.recons_loss, args.nb_joints)
118
+
119
+ ##### ------ warm-up ------- #####
120
+ avg_recons, avg_perplexity, avg_commit = 0., 0., 0.
121
+
122
+ for nb_iter in tqdm(range(1, args.warm_up_iter)):
123
+
124
+ optimizer, current_lr = update_lr_warm_up(optimizer, nb_iter, args.warm_up_iter, args.lr)
125
+
126
+ gt_motion = next(train_loader_iter)
127
+ gt_motion = gt_motion.cuda().float() # (bs, 64, dim)
128
+
129
+ pred_motion, loss_commit, perplexity = net(gt_motion)
130
+ loss_motion = Loss(pred_motion, gt_motion)
131
+ loss_vel = Loss.forward_joint(pred_motion, gt_motion)
132
+
133
+ loss = loss_motion + args.commit * loss_commit + args.loss_vel * loss_vel
134
+
135
+ optimizer.zero_grad()
136
+ loss.backward()
137
+ optimizer.step()
138
+
139
+ avg_recons += loss_motion.item()
140
+ avg_perplexity += perplexity.item()
141
+ avg_commit += loss_commit.item()
142
+
143
+ if nb_iter % args.print_iter == 0 :
144
+ avg_recons /= args.print_iter
145
+ avg_perplexity /= args.print_iter
146
+ avg_commit /= args.print_iter
147
+
148
+ logger.info(f"Warmup. Iter {nb_iter} : lr {current_lr:.5f} \t Commit. {avg_commit:.5f} \t PPL. {avg_perplexity:.2f} \t Recons. {avg_recons:.5f}")
149
+
150
+ avg_recons, avg_perplexity, avg_commit = 0., 0., 0.
151
+
152
+ ##### ---- Training ---- #####
153
+ avg_recons, avg_perplexity, avg_commit = 0., 0., 0.
154
+ best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, writer, logger = eval_trans.evaluation_vqvae(args.out_dir, val_loader, net, logger, writer, 0, best_fid=1000, best_iter=0, best_div=100, best_top1=0, best_top2=0, best_top3=0, best_matching=100, eval_wrapper=eval_wrapper)
155
+
156
+ for nb_iter in tqdm(range(1, args.total_iter + 1)):
157
+
158
+ gt_motion = next(train_loader_iter)
159
+ gt_motion = gt_motion.cuda().float() # bs, nb_joints, joints_dim, seq_len
160
+
161
+ if args.sep_uplow:
162
+ pred_motion, loss_commit, perplexity = net(gt_motion, idx_noise=0)
163
+ else:
164
+ pred_motion, loss_commit, perplexity = net(gt_motion)
165
+ loss_motion = Loss(pred_motion, gt_motion)
166
+ loss_vel = Loss.forward_joint(pred_motion, gt_motion)
167
+
168
+ loss = loss_motion + args.commit * loss_commit + args.loss_vel * loss_vel
169
+
170
+ optimizer.zero_grad()
171
+ loss.backward()
172
+ optimizer.step()
173
+ scheduler.step()
174
+
175
+ avg_recons += loss_motion.item()
176
+ avg_perplexity += perplexity.item()
177
+ avg_commit += loss_commit.item()
178
+
179
+ if nb_iter % args.print_iter == 0 :
180
+ avg_recons /= args.print_iter
181
+ avg_perplexity /= args.print_iter
182
+ avg_commit /= args.print_iter
183
+
184
+ writer.add_scalar('./Train/L1', avg_recons, nb_iter)
185
+ writer.add_scalar('./Train/PPL', avg_perplexity, nb_iter)
186
+ writer.add_scalar('./Train/Commit', avg_commit, nb_iter)
187
+
188
+ logger.info(f"Train. Iter {nb_iter} : \t Commit. {avg_commit:.5f} \t PPL. {avg_perplexity:.2f} \t Recons. {avg_recons:.5f}")
189
+
190
+ avg_recons, avg_perplexity, avg_commit = 0., 0., 0.
191
+
192
+ if nb_iter % args.eval_iter==0 :
193
+ best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, writer, logger = eval_trans.evaluation_vqvae(args.out_dir, val_loader, net, logger, writer, nb_iter, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, eval_wrapper=eval_wrapper)
194
+
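For reference, a compact sketch of the learning-rate behaviour configured above: a linear ramp via update_lr_warm_up followed by MultiStepLR decay. The dummy parameter and the handful of iterations are purely illustrative; the milestone and gamma are the option_vq.py defaults.

import torch

param = torch.nn.Parameter(torch.zeros(1))              # dummy parameter
optimizer = torch.optim.AdamW([param], lr=2e-4, betas=(0.9, 0.99), weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[200000], gamma=0.05)

warm_up_iter, base_lr = 1000, 2e-4
for nb_iter in range(1, 5):
    # same rule as update_lr_warm_up(): ramp linearly towards the base lr
    current_lr = base_lr * (nb_iter + 1) / (warm_up_iter + 1)
    for group in optimizer.param_groups:
        group['lr'] = current_lr
    print(f'iter {nb_iter}: lr={current_lr:.2e}')
# after warm-up, the main loop calls scheduler.step(), which multiplies the lr by 0.05 at iteration 200000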
utils/eval_trans.py ADDED
@@ -0,0 +1,824 @@
1
+ import os
2
+
3
+ import clip
4
+ import numpy as np
5
+ import torch
6
+ from scipy import linalg
7
+
8
+ # import visualization.plot_3d_global as plot_3d
9
+ from utils.motion_process import recover_from_ric
10
+ from exit.utils import get_model, visualize_2motions, generate_src_mask
11
+ from tqdm import tqdm
12
+
13
+
14
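+ # NOTE: depends on plot_3d, whose import is commented out above; calling this helper as-is would raise a NameError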
+ def tensorborad_add_video_xyz(writer, xyz, nb_iter, tag, nb_vis=4, title_batch=None, outname=None):
15
+ xyz = xyz[:1]
16
+ bs, seq = xyz.shape[:2]
17
+ xyz = xyz.reshape(bs, seq, -1, 3)
18
+ plot_xyz = plot_3d.draw_to_batch(xyz.cpu().numpy(),title_batch, outname)
19
+ plot_xyz =np.transpose(plot_xyz, (0, 1, 4, 2, 3))
20
+ writer.add_video(tag, plot_xyz, nb_iter, fps = 20)
21
+
22
+ @torch.no_grad()
23
+ def evaluation_vqvae(out_dir, val_loader, net, logger, writer, nb_iter, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, eval_wrapper, draw = True, save = True, savegif=False, savenpy=False) :
24
+ net.eval()
25
+ nb_sample = 0
26
+
27
+ draw_org = []
28
+ draw_pred = []
29
+ draw_text = []
30
+
31
+
32
+ motion_annotation_list = []
33
+ motion_pred_list = []
34
+
35
+ R_precision_real = 0
36
+ R_precision = 0
37
+
38
+ nb_sample = 0
39
+ matching_score_real = 0
40
+ matching_score_pred = 0
41
+ for batch in val_loader:
42
+ word_embeddings, pos_one_hots, caption, sent_len, motion, m_length, token, name = batch
43
+
44
+ motion = motion.cuda()
45
+ et, em = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, motion, m_length)
46
+ bs, seq = motion.shape[0], motion.shape[1]
47
+
48
+ num_joints = 21 if motion.shape[-1] == 251 else 22
49
+
50
+ pred_pose_eval = torch.zeros((bs, seq, motion.shape[-1])).cuda()
51
+
52
+ for i in range(bs):
53
+ pose = val_loader.dataset.inv_transform(motion[i:i+1, :m_length[i], :].detach().cpu().numpy())
54
+ # pose_xyz = recover_from_ric(torch.from_numpy(pose).float().cuda(), num_joints)
55
+
56
+
57
+ pred_pose, loss_commit, perplexity = net(motion[i:i+1, :m_length[i]])
58
+ # pred_denorm = val_loader.dataset.inv_transform(pred_pose.detach().cpu().numpy())
59
+ # pred_xyz = recover_from_ric(torch.from_numpy(pred_denorm).float().cuda(), num_joints)
60
+
61
+ # if savenpy:
62
+ # np.save(os.path.join(out_dir, name[i]+'_gt.npy'), pose_xyz[:, :m_length[i]].cpu().numpy())
63
+ # np.save(os.path.join(out_dir, name[i]+'_pred.npy'), pred_xyz.detach().cpu().numpy())
64
+
65
+ pred_pose_eval[i:i+1,:m_length[i],:] = pred_pose
66
+
67
+ # if i < min(4, bs):
68
+ # draw_org.append(pose_xyz)
69
+ # draw_pred.append(pred_xyz)
70
+ # draw_text.append(caption[i])
71
+
72
+ et_pred, em_pred = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pred_pose_eval, m_length)
73
+
74
+ motion_pred_list.append(em_pred)
75
+ motion_annotation_list.append(em)
76
+
77
+ temp_R, temp_match = calculate_R_precision(et.cpu().numpy(), em.cpu().numpy(), top_k=3, sum_all=True)
78
+ R_precision_real += temp_R
79
+ matching_score_real += temp_match
80
+ temp_R, temp_match = calculate_R_precision(et_pred.cpu().numpy(), em_pred.cpu().numpy(), top_k=3, sum_all=True)
81
+ R_precision += temp_R
82
+ matching_score_pred += temp_match
83
+
84
+ nb_sample += bs
85
+
86
+ motion_annotation_np = torch.cat(motion_annotation_list, dim=0).cpu().numpy()
87
+ motion_pred_np = torch.cat(motion_pred_list, dim=0).cpu().numpy()
88
+ gt_mu, gt_cov = calculate_activation_statistics(motion_annotation_np)
89
+ mu, cov= calculate_activation_statistics(motion_pred_np)
90
+
91
+ diversity_real = calculate_diversity(motion_annotation_np, 300 if nb_sample > 300 else 100)
92
+ diversity = calculate_diversity(motion_pred_np, 300 if nb_sample > 300 else 100)
93
+
94
+ R_precision_real = R_precision_real / nb_sample
95
+ R_precision = R_precision / nb_sample
96
+
97
+ matching_score_real = matching_score_real / nb_sample
98
+ matching_score_pred = matching_score_pred / nb_sample
99
+
100
+ fid = calculate_frechet_distance(gt_mu, gt_cov, mu, cov)
101
+
102
+ msg = f"--> \t Eva. Iter {nb_iter} :, FID. {fid:.4f}, Diversity Real. {diversity_real:.4f}, Diversity. {diversity:.4f}, R_precision_real. {R_precision_real}, R_precision. {R_precision}, matching_score_real. {matching_score_real}, matching_score_pred. {matching_score_pred}"
103
+ logger.info(msg)
104
+
105
+ if draw:
106
+ writer.add_scalar('./Test/FID', fid, nb_iter)
107
+ writer.add_scalar('./Test/Diversity', diversity, nb_iter)
108
+ writer.add_scalar('./Test/top1', R_precision[0], nb_iter)
109
+ writer.add_scalar('./Test/top2', R_precision[1], nb_iter)
110
+ writer.add_scalar('./Test/top3', R_precision[2], nb_iter)
111
+ writer.add_scalar('./Test/matching_score', matching_score_pred, nb_iter)
112
+
113
+
114
+ # if nb_iter % 5000 == 0 :
115
+ # for ii in range(4):
116
+ # tensorborad_add_video_xyz(writer, draw_org[ii], nb_iter, tag='./Vis/org_eval'+str(ii), nb_vis=1, title_batch=[draw_text[ii]], outname=[os.path.join(out_dir, 'gt'+str(ii)+'.gif')] if savegif else None)
117
+
118
+ # if nb_iter % 5000 == 0 :
119
+ # for ii in range(4):
120
+ # tensorborad_add_video_xyz(writer, draw_pred[ii], nb_iter, tag='./Vis/pred_eval'+str(ii), nb_vis=1, title_batch=[draw_text[ii]], outname=[os.path.join(out_dir, 'pred'+str(ii)+'.gif')] if savegif else None)
121
+
122
+
123
+ if fid < best_fid :
124
+ msg = f"--> --> \t FID Improved from {best_fid:.5f} to {fid:.5f} !!!"
125
+ logger.info(msg)
126
+ best_fid, best_iter = fid, nb_iter
127
+ # if save:
128
+ # torch.save({'net' : net.state_dict()}, os.path.join(out_dir, 'net_best_fid.pth'))
129
+
130
+ if abs(diversity_real - diversity) < abs(diversity_real - best_div) :
131
+ msg = f"--> --> \t Diversity Improved from {best_div:.5f} to {diversity:.5f} !!!"
132
+ logger.info(msg)
133
+ best_div = diversity
134
+ # if save:
135
+ # torch.save({'net' : net.state_dict()}, os.path.join(out_dir, 'net_best_div.pth'))
136
+
137
+ if R_precision[0] > best_top1 :
138
+ msg = f"--> --> \t Top1 Improved from {best_top1:.4f} to {R_precision[0]:.4f} !!!"
139
+ logger.info(msg)
140
+ best_top1 = R_precision[0]
141
+ # if save:
142
+ # torch.save({'net' : net.state_dict()}, os.path.join(out_dir, 'net_best_top1.pth'))
143
+
144
+ if R_precision[1] > best_top2 :
145
+ msg = f"--> --> \t Top2 Improved from {best_top2:.4f} to {R_precision[1]:.4f} !!!"
146
+ logger.info(msg)
147
+ best_top2 = R_precision[1]
148
+
149
+ if R_precision[2] > best_top3 :
150
+ msg = f"--> --> \t Top3 Improved from {best_top3:.4f} to {R_precision[2]:.4f} !!!"
151
+ logger.info(msg)
152
+ best_top3 = R_precision[2]
153
+
154
+ if matching_score_pred < best_matching :
155
+ msg = f"--> --> \t matching_score Improved from {best_matching:.5f} to {matching_score_pred:.5f} !!!"
156
+ logger.info(msg)
157
+ best_matching = matching_score_pred
158
+ # if save:
159
+ # torch.save({'net' : net.state_dict()}, os.path.join(out_dir, 'net_best_matching.pth'))
160
+
161
+ if save:
162
+ torch.save({'net' : net.state_dict()}, os.path.join(out_dir, 'net_last.pth'))
163
+
164
+ net.train()
165
+ return best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, writer, logger
166
+
167
+
168
+ @torch.no_grad()
169
+ def evaluation_transformer(out_dir, val_loader, net, trans, logger, writer, nb_iter, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, clip_model, eval_wrapper, dataname='t2m', draw = True, save = True, savegif=False, num_repeat=1, rand_pos=False, CFG=-1) :
170
+ if num_repeat < 0:
171
+ is_avg_all = True
172
+ num_repeat = -num_repeat
173
+ else:
174
+ is_avg_all = False
175
+
176
+
177
+ trans.eval()
178
+ nb_sample = 0
179
+
180
+ draw_org = []
181
+ draw_pred = []
182
+ draw_text = []
183
+ draw_text_pred = []
184
+
185
+ motion_annotation_list = []
186
+ motion_pred_list = []
187
+ motion_multimodality = []
188
+ R_precision_real = 0
189
+ R_precision = 0
190
+ matching_score_real = 0
191
+ matching_score_pred = 0
192
+
193
+ nb_sample = 0
194
+ blank_id = get_model(trans).num_vq
195
+ for batch in tqdm(val_loader):
196
+ word_embeddings, pos_one_hots, clip_text, sent_len, pose, m_length, token, name = batch
197
+
198
+ bs, seq = pose.shape[:2]
199
+ num_joints = 21 if pose.shape[-1] == 251 else 22
200
+
201
+ text = clip.tokenize(clip_text, truncate=True).cuda()
202
+
203
+ feat_clip_text, word_emb = clip_model(text)
204
+
205
+ motion_multimodality_batch = []
206
+ m_tokens_len = torch.ceil((m_length)/4)
207
+
208
+
209
+ pred_len = m_length.cuda()
210
+ pred_tok_len = m_tokens_len
211
+
212
+
213
+ for i in range(num_repeat):
214
+ pred_pose_eval = torch.zeros((bs, seq, pose.shape[-1])).cuda()
215
+ # pred_len = torch.ones(bs).long()
216
+
217
+ index_motion = trans(feat_clip_text, word_emb, type="sample", m_length=pred_len, rand_pos=rand_pos, CFG=CFG)
218
+ # [INFO] 1. this get the last index of blank_id
219
+ # pred_length = (index_motion == blank_id).int().argmax(1).float()
220
+ # [INFO] 2. this get the first index of blank_id
221
+ pred_length = (index_motion >= blank_id).int()
222
+ pred_length = torch.topk(pred_length, k=1, dim=1).indices.squeeze().float()
223
+ # pred_length[pred_length==0] = index_motion.shape[1] # if blank_id in the first frame, set length to max
224
+ # [INFO] need to run single sample at a time b/c it's conv
225
+ for k in range(bs):
226
+ ######### [INFO] Eval only the predicted length
227
+ # if pred_length[k] == 0:
228
+ # pred_len[k] = seq
229
+ # continue
230
+ # pred_pose = net(index_motion[k:k+1, :int(pred_length[k].item())], type='decode')
231
+ # cur_len = pred_pose.shape[1]
232
+
233
+ # pred_len[k] = min(cur_len, seq)
234
+ # pred_pose_eval[k:k+1, :cur_len] = pred_pose[:, :seq]
235
+ # et_pred, em_pred = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pred_pose_eval, pred_len)
236
+ ######################################################
237
+
238
+ ######### [INFO] Eval by m_length
239
+ pred_pose = net(index_motion[k:k+1, :int(pred_tok_len[k].item())], type='decode')
240
+ pred_pose_eval[k:k+1, :int(pred_len[k].item())] = pred_pose
241
+ et_pred, em_pred = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pred_pose_eval, m_length)
242
+ ######################################################
243
+
244
+ motion_multimodality_batch.append(em_pred.reshape(bs, 1, -1))
245
+
246
+ if i == 0 or is_avg_all:
247
+ pose = pose.cuda().float()
248
+
249
+ et, em = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pose, m_length)
250
+ motion_annotation_list.append(em)
251
+ motion_pred_list.append(em_pred)
252
+
253
+ # if draw:
254
+ # pose = val_loader.dataset.inv_transform(pose.detach().cpu().numpy())
255
+ # pose_xyz = recover_from_ric(torch.from_numpy(pose).float().cuda(), num_joints)
256
+
257
+
258
+ # for j in range(min(4, bs)):
259
+ # draw_org.append(pose_xyz[j][:m_length[j]].unsqueeze(0))
260
+ # draw_text.append(clip_text[j])
261
+
262
+ temp_R, temp_match = calculate_R_precision(et.cpu().numpy(), em.cpu().numpy(), top_k=3, sum_all=True)
263
+ R_precision_real += temp_R
264
+ matching_score_real += temp_match
265
+ temp_R, temp_match = calculate_R_precision(et_pred.cpu().numpy(), em_pred.cpu().numpy(), top_k=3, sum_all=True)
266
+ R_precision += temp_R
267
+ matching_score_pred += temp_match
268
+
269
+ nb_sample += bs
270
+ motion_multimodality.append(torch.cat(motion_multimodality_batch, dim=1))
271
+
272
+ motion_annotation_np = torch.cat(motion_annotation_list, dim=0).cpu().numpy()
273
+ motion_pred_np = torch.cat(motion_pred_list, dim=0).cpu().numpy()
274
+ gt_mu, gt_cov = calculate_activation_statistics(motion_annotation_np)
275
+ mu, cov= calculate_activation_statistics(motion_pred_np)
276
+
277
+ diversity_real = calculate_diversity(motion_annotation_np, 300 if nb_sample > 300 else 100)
278
+ diversity = calculate_diversity(motion_pred_np, 300 if nb_sample > 300 else 100)
279
+
280
+ R_precision_real = R_precision_real / nb_sample
281
+ R_precision = R_precision / nb_sample
282
+
283
+ matching_score_real = matching_score_real / nb_sample
284
+ matching_score_pred = matching_score_pred / nb_sample
285
+
286
+ multimodality = 0
287
+ motion_multimodality = torch.cat(motion_multimodality, dim=0).cpu().numpy()
288
+ if num_repeat > 1:
289
+ multimodality = calculate_multimodality(motion_multimodality, 10)
290
+
291
+ fid = calculate_frechet_distance(gt_mu, gt_cov, mu, cov)
292
+
293
+ msg = f"--> \t Eva. Iter {nb_iter} :, \n\
294
+ FID. {fid:.4f} , \n\
295
+ Diversity Real. {diversity_real:.4f}, \n\
296
+ Diversity. {diversity:.4f}, \n\
297
+ R_precision_real. {R_precision_real}, \n\
298
+ R_precision. {R_precision}, \n\
299
+ matching_score_real. {matching_score_real}, \n\
300
+ matching_score_pred. {matching_score_pred}, \n\
301
+ multimodality. {multimodality:.4f}"
302
+ logger.info(msg)
303
+
304
+
305
+ if draw:
306
+ writer.add_scalar('./Test/FID', fid, nb_iter)
307
+ writer.add_scalar('./Test/Diversity', diversity, nb_iter)
308
+ writer.add_scalar('./Test/top1', R_precision[0], nb_iter)
309
+ writer.add_scalar('./Test/top2', R_precision[1], nb_iter)
310
+ writer.add_scalar('./Test/top3', R_precision[2], nb_iter)
311
+ writer.add_scalar('./Test/matching_score', matching_score_pred, nb_iter)
312
+ writer.add_scalar('./Test/multimodality', multimodality, nb_iter)
313
+
314
+ # if nb_iter % 10000 == 0 :
315
+ # for ii in range(4):
316
+ # tensorborad_add_video_xyz(writer, draw_org[ii], nb_iter, tag='./Vis/org_eval'+str(ii), nb_vis=1, title_batch=[draw_text[ii]], outname=[os.path.join(out_dir, 'gt'+str(ii)+'.gif')] if savegif else None)
317
+ # if nb_iter % 10000 == 0 :
318
+ # for ii in range(4):
319
+ # tensorborad_add_video_xyz(writer, draw_pred[ii], nb_iter, tag='./Vis/pred_eval'+str(ii), nb_vis=1, title_batch=[draw_text_pred[ii]], outname=[os.path.join(out_dir, 'pred'+str(ii)+'.gif')] if savegif else None)
320
+
321
+
322
+ if fid < best_fid :
323
+ msg = f"--> --> \t FID Improved from {best_fid:.5f} to {fid:.5f} !!!"
324
+ logger.info(msg)
325
+ best_fid, best_iter = fid, nb_iter
326
+ # if save:
327
+ # torch.save({'trans' : get_model(trans).state_dict()}, os.path.join(out_dir, 'net_best_fid.pth'))
328
+
329
+ if matching_score_pred < best_matching :
330
+ msg = f"--> --> \t matching_score Improved from {best_matching:.5f} to {matching_score_pred:.5f} !!!"
331
+ logger.info(msg)
332
+ best_matching = matching_score_pred
333
+
334
+ if abs(diversity_real - diversity) < abs(diversity_real - best_div) :
335
+ msg = f"--> --> \t Diversity Improved from {best_div:.5f} to {diversity:.5f} !!!"
336
+ logger.info(msg)
337
+ best_div = diversity
338
+
339
+ if R_precision[0] > best_top1 :
340
+ msg = f"--> --> \t Top1 Improved from {best_top1:.4f} to {R_precision[0]:.4f} !!!"
341
+ logger.info(msg)
342
+ best_top1 = R_precision[0]
343
+
344
+ if R_precision[1] > best_top2 :
345
+ msg = f"--> --> \t Top2 Improved from {best_top2:.4f} to {R_precision[1]:.4f} !!!"
346
+ logger.info(msg)
347
+ best_top2 = R_precision[1]
348
+
349
+ if R_precision[2] > best_top3 :
350
+ msg = f"--> --> \t Top3 Improved from {best_top3:.4f} to {R_precision[2]:.4f} !!!"
351
+ logger.info(msg)
352
+ best_top3 = R_precision[2]
353
+
354
+ if save:
355
+ torch.save({'trans' : get_model(trans).state_dict()}, os.path.join(out_dir, 'net_last.pth'))
356
+
357
+ trans.train()
358
+ return pred_pose_eval, pose, m_length, clip_text, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, multimodality, writer, logger
359
+
360
+ def evaluation_transformer_uplow(out_dir, val_loader, net, trans, logger, writer, nb_iter, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, clip_model, eval_wrapper, dataname, draw = True, save = True, savegif=False, num_repeat=1, rand_pos=False, CFG=-1) :
361
+ from utils.humanml_utils import HML_UPPER_BODY_MASK, HML_LOWER_BODY_MASK
362
+
363
+ trans.eval()
364
+ nb_sample = 0
365
+
366
+ draw_org = []
367
+ draw_pred = []
368
+ draw_text = []
369
+ draw_text_pred = []
370
+
371
+ motion_annotation_list = []
372
+ motion_pred_list = []
373
+ motion_multimodality = []
374
+ R_precision_real = 0
375
+ R_precision = 0
376
+ matching_score_real = 0
377
+ matching_score_pred = 0
378
+
379
+ nb_sample = 0
380
+ blank_id = get_model(trans).num_vq
381
+ for batch in tqdm(val_loader):
382
+ word_embeddings, pos_one_hots, clip_text, sent_len, pose, m_length, token, name = batch
383
+ pose = pose.cuda().float()
384
+ pose_lower = pose[..., HML_LOWER_BODY_MASK]
385
+ bs, seq = pose.shape[:2]
386
+ num_joints = 21 if pose.shape[-1] == 251 else 22
387
+
388
+ text = clip.tokenize(clip_text, truncate=True).cuda()
389
+
390
+ feat_clip_text, word_emb = clip_model(text)
391
+
392
+ motion_multimodality_batch = []
393
+ m_tokens_len = torch.ceil((m_length)/4)
394
+
395
+
396
+ pred_len = m_length.cuda()
397
+ pred_tok_len = m_tokens_len
398
+
399
+ max_motion_length = int(seq/4) + 1
400
+ mot_end_idx = get_model(net).vqvae.num_code
401
+ mot_pad_idx = get_model(net).vqvae.num_code + 1
402
+ target_lower = []
403
+ for k in range(bs):
404
+ target = net(pose[k:k+1, :m_length[k]], type='encode')
405
+ if m_tokens_len[k]+1 < max_motion_length:
406
+ target = torch.cat([target,
407
+ torch.ones((1, 1, 2), dtype=int, device=target.device) * mot_end_idx,
408
+ torch.ones((1, max_motion_length-1-m_tokens_len[k].int().item(), 2), dtype=int, device=target.device) * mot_pad_idx], axis=1)
409
+ else:
410
+ target = torch.cat([target,
411
+ torch.ones((1, 1, 2), dtype=int, device=target.device) * mot_end_idx], axis=1)
412
+ target_lower.append(target[..., 1])
413
+ target_lower = torch.cat(target_lower, axis=0)
414
+
415
+ for i in range(num_repeat):
416
+ pred_pose_eval = torch.zeros((bs, seq, pose.shape[-1])).cuda()
417
+ # pred_len = torch.ones(bs).long()
418
+
419
+ index_motion = trans(feat_clip_text, target_lower, word_emb, type="sample", m_length=pred_len, rand_pos=rand_pos, CFG=CFG)
420
+ # [INFO] 1. this get the last index of blank_id
421
+ # pred_length = (index_motion == blank_id).int().argmax(1).float()
422
+ # [INFO] 2. this get the first index of blank_id
423
+ pred_length = (index_motion >= blank_id).int()
424
+ pred_length = torch.topk(pred_length, k=1, dim=1).indices.squeeze().float()
425
+ # pred_length[pred_length==0] = index_motion.shape[1] # if blank_id in the first frame, set length to max
426
+ # [INFO] need to run single sample at a time b/c it's conv
427
+ for k in range(bs):
428
+ ######### [INFO] Eval only the predicted length
429
+ # if pred_length[k] == 0:
430
+ # pred_len[k] = seq
431
+ # continue
432
+ # pred_pose = net(index_motion[k:k+1, :int(pred_length[k].item())], type='decode')
433
+ # cur_len = pred_pose.shape[1]
434
+
435
+ # pred_len[k] = min(cur_len, seq)
436
+ # pred_pose_eval[k:k+1, :cur_len] = pred_pose[:, :seq]
437
+ # et_pred, em_pred = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pred_pose_eval, pred_len)
438
+ ######################################################
439
+
440
+ ######### [INFO] Eval by m_length
441
+ all_tokens = torch.cat([
442
+ index_motion[k:k+1, :int(pred_tok_len[k].item()), None],
443
+ target_lower[k:k+1, :int(pred_tok_len[k].item()), None]
444
+ ], axis=-1)
445
+ pred_pose = net(all_tokens, type='decode')
446
+ pred_pose_eval[k:k+1, :int(pred_len[k].item())] = pred_pose
447
+ pred_pose_eval[..., HML_LOWER_BODY_MASK] = pose_lower
448
+ et_pred, em_pred = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pred_pose_eval, m_length)
449
+ ######################################################
450
+
451
+ motion_multimodality_batch.append(em_pred.reshape(bs, 1, -1))
452
+
453
+ if i == 0:
454
+
455
+
456
+ et, em = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pose, m_length)
457
+ motion_annotation_list.append(em)
458
+ motion_pred_list.append(em_pred)
459
+
460
+ # if draw:
461
+ # pose = val_loader.dataset.inv_transform(pose.detach().cpu().numpy())
462
+ # pose_xyz = recover_from_ric(torch.from_numpy(pose).float().cuda(), num_joints)
463
+
464
+
465
+ # for j in range(min(4, bs)):
466
+ # draw_org.append(pose_xyz[j][:m_length[j]].unsqueeze(0))
467
+ # draw_text.append(clip_text[j])
468
+
469
+ temp_R, temp_match = calculate_R_precision(et.cpu().numpy(), em.cpu().numpy(), top_k=3, sum_all=True)
470
+ R_precision_real += temp_R
471
+ matching_score_real += temp_match
472
+ temp_R, temp_match = calculate_R_precision(et_pred.cpu().numpy(), em_pred.cpu().numpy(), top_k=3, sum_all=True)
473
+ R_precision += temp_R
474
+ matching_score_pred += temp_match
475
+
476
+ nb_sample += bs
477
+ motion_multimodality.append(torch.cat(motion_multimodality_batch, dim=1))
478
+
479
+ motion_annotation_np = torch.cat(motion_annotation_list, dim=0).cpu().numpy()
480
+ motion_pred_np = torch.cat(motion_pred_list, dim=0).cpu().numpy()
481
+ gt_mu, gt_cov = calculate_activation_statistics(motion_annotation_np)
482
+ mu, cov = calculate_activation_statistics(motion_pred_np)
483
+
484
+ diversity_real = calculate_diversity(motion_annotation_np, 300 if nb_sample > 300 else 100)
485
+ diversity = calculate_diversity(motion_pred_np, 300 if nb_sample > 300 else 100)
486
+
487
+ R_precision_real = R_precision_real / nb_sample
488
+ R_precision = R_precision / nb_sample
489
+
490
+ matching_score_real = matching_score_real / nb_sample
491
+ matching_score_pred = matching_score_pred / nb_sample
492
+
493
+ multimodality = 0
494
+ motion_multimodality = torch.cat(motion_multimodality, dim=0).cpu().numpy()
495
+ if num_repeat > 1:
496
+ multimodality = calculate_multimodality(motion_multimodality, 10)
497
+
498
+ fid = calculate_frechet_distance(gt_mu, gt_cov, mu, cov)
499
+
500
+ msg = f"--> \t Eva. Iter {nb_iter} :, \n\
501
+ FID. {fid:.4f} , \n\
502
+ Diversity Real. {diversity_real:.4f}, \n\
503
+ Diversity. {diversity:.4f}, \n\
504
+ R_precision_real. {R_precision_real}, \n\
505
+ R_precision. {R_precision}, \n\
506
+ matching_score_real. {matching_score_real}, \n\
507
+ matching_score_pred. {matching_score_pred}, \n\
508
+ multimodality. {multimodality:.4f}"
509
+ logger.info(msg)
510
+
511
+
512
+ if draw:
513
+ writer.add_scalar('./Test/FID', fid, nb_iter)
514
+ writer.add_scalar('./Test/Diversity', diversity, nb_iter)
515
+ writer.add_scalar('./Test/top1', R_precision[0], nb_iter)
516
+ writer.add_scalar('./Test/top2', R_precision[1], nb_iter)
517
+ writer.add_scalar('./Test/top3', R_precision[2], nb_iter)
518
+ writer.add_scalar('./Test/matching_score', matching_score_pred, nb_iter)
519
+ writer.add_scalar('./Test/multimodality', multimodality, nb_iter)
520
+
521
+ # if nb_iter % 10000 == 0 :
522
+ # for ii in range(4):
523
+ # tensorborad_add_video_xyz(writer, draw_org[ii], nb_iter, tag='./Vis/org_eval'+str(ii), nb_vis=1, title_batch=[draw_text[ii]], outname=[os.path.join(out_dir, 'gt'+str(ii)+'.gif')] if savegif else None)
524
+ # if nb_iter % 10000 == 0 :
525
+ # for ii in range(4):
526
+ # tensorborad_add_video_xyz(writer, draw_pred[ii], nb_iter, tag='./Vis/pred_eval'+str(ii), nb_vis=1, title_batch=[draw_text_pred[ii]], outname=[os.path.join(out_dir, 'pred'+str(ii)+'.gif')] if savegif else None)
527
+
528
+
529
+ if fid < best_fid :
530
+ msg = f"--> --> \t FID Improved from {best_fid:.5f} to {fid:.5f} !!!"
531
+ logger.info(msg)
532
+ best_fid, best_iter = fid, nb_iter
533
+ # if save:
534
+ # torch.save({'trans' : get_model(trans).state_dict()}, os.path.join(out_dir, 'net_best_fid.pth'))
535
+
536
+ if matching_score_pred < best_matching :
537
+ msg = f"--> --> \t matching_score Improved from {best_matching:.5f} to {matching_score_pred:.5f} !!!"
538
+ logger.info(msg)
539
+ best_matching = matching_score_pred
540
+
541
+ if abs(diversity_real - diversity) < abs(diversity_real - best_div) :
542
+ msg = f"--> --> \t Diversity Improved from {best_div:.5f} to {diversity:.5f} !!!"
543
+ logger.info(msg)
544
+ best_div = diversity
545
+
546
+ if R_precision[0] > best_top1 :
547
+ msg = f"--> --> \t Top1 Improved from {best_top1:.4f} to {R_precision[0]:.4f} !!!"
548
+ logger.info(msg)
549
+ best_top1 = R_precision[0]
550
+
551
+ if R_precision[1] > best_top2 :
552
+ msg = f"--> --> \t Top2 Improved from {best_top2:.4f} to {R_precision[1]:.4f} !!!"
553
+ logger.info(msg)
554
+ best_top2 = R_precision[1]
555
+
556
+ if R_precision[2] > best_top3 :
557
+ msg = f"--> --> \t Top3 Improved from {best_top3:.4f} to {R_precision[2]:.4f} !!!"
558
+ logger.info(msg)
559
+ best_top3 = R_precision[2]
560
+
561
+ if save:
562
+ torch.save({'trans' : get_model(trans).state_dict()}, os.path.join(out_dir, 'net_last.pth'))
563
+
564
+ trans.train()
565
+ return pred_pose_eval, pose, m_length, clip_text, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, multimodality, writer, logger
566
+
567
+ @torch.no_grad()
568
+ def evaluation_transformer_test(out_dir, val_loader, net, trans, logger, writer, nb_iter, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, best_multi, clip_model, eval_wrapper, draw = True, save = True, savegif=False, savenpy=False) :
569
+
570
+ trans.eval()
571
+ nb_sample = 0
572
+
573
+ draw_org = []
574
+ draw_pred = []
575
+ draw_text = []
576
+ draw_text_pred = []
577
+ draw_name = []
578
+
579
+ motion_annotation_list = []
580
+ motion_pred_list = []
581
+ motion_multimodality = []
582
+ R_precision_real = 0
583
+ R_precision = 0
584
+ matching_score_real = 0
585
+ matching_score_pred = 0
586
+
587
+ nb_sample = 0
588
+
589
+ for batch in val_loader:
590
+
591
+ word_embeddings, pos_one_hots, clip_text, sent_len, pose, m_length, token, name = batch
592
+ bs, seq = pose.shape[:2]
593
+ num_joints = 21 if pose.shape[-1] == 251 else 22
594
+
595
+ text = clip.tokenize(clip_text, truncate=True).cuda()
596
+
597
+ feat_clip_text = clip_model.encode_text(text).float()
598
+ motion_multimodality_batch = []
599
+ for i in range(30):
600
+ pred_pose_eval = torch.zeros((bs, seq, pose.shape[-1])).cuda()
601
+ pred_len = torch.ones(bs).long()
602
+
603
+ for k in range(bs):
604
+ try:
605
+ index_motion = trans.sample(feat_clip_text[k:k+1], True)
606
+ except:
607
+ index_motion = torch.ones(1,1).cuda().long()
608
+
609
+ pred_pose = net.forward_decoder(index_motion)
610
+ cur_len = pred_pose.shape[1]
611
+
612
+ pred_len[k] = min(cur_len, seq)
613
+ pred_pose_eval[k:k+1, :cur_len] = pred_pose[:, :seq]
614
+
615
+ if i == 0 and (draw or savenpy):
616
+ pred_denorm = val_loader.dataset.inv_transform(pred_pose.detach().cpu().numpy())
617
+ pred_xyz = recover_from_ric(torch.from_numpy(pred_denorm).float().cuda(), num_joints)
618
+
619
+ if savenpy:
620
+ np.save(os.path.join(out_dir, name[k]+'_pred.npy'), pred_xyz.detach().cpu().numpy())
621
+
622
+ if draw:
623
+ if i == 0:
624
+ draw_pred.append(pred_xyz)
625
+ draw_text_pred.append(clip_text[k])
626
+ draw_name.append(name[k])
627
+
628
+ et_pred, em_pred = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pred_pose_eval, pred_len)
629
+
630
+ motion_multimodality_batch.append(em_pred.reshape(bs, 1, -1))
631
+
632
+ if i == 0:
633
+ pose = pose.cuda().float()
634
+
635
+ et, em = eval_wrapper.get_co_embeddings(word_embeddings, pos_one_hots, sent_len, pose, m_length)
636
+ motion_annotation_list.append(em)
637
+ motion_pred_list.append(em_pred)
638
+
639
+ if draw or savenpy:
640
+ pose = val_loader.dataset.inv_transform(pose.detach().cpu().numpy())
641
+ pose_xyz = recover_from_ric(torch.from_numpy(pose).float().cuda(), num_joints)
642
+
643
+ if savenpy:
644
+ for j in range(bs):
645
+ np.save(os.path.join(out_dir, name[j]+'_gt.npy'), pose_xyz[j][:m_length[j]].unsqueeze(0).cpu().numpy())
646
+
647
+ if draw:
648
+ for j in range(bs):
649
+ draw_org.append(pose_xyz[j][:m_length[j]].unsqueeze(0))
650
+ draw_text.append(clip_text[j])
651
+
652
+ temp_R, temp_match = calculate_R_precision(et.cpu().numpy(), em.cpu().numpy(), top_k=3, sum_all=True)
653
+ R_precision_real += temp_R
654
+ matching_score_real += temp_match
655
+ temp_R, temp_match = calculate_R_precision(et_pred.cpu().numpy(), em_pred.cpu().numpy(), top_k=3, sum_all=True)
656
+ R_precision += temp_R
657
+ matching_score_pred += temp_match
658
+
659
+ nb_sample += bs
660
+
661
+ motion_multimodality.append(torch.cat(motion_multimodality_batch, dim=1))
662
+
663
+ motion_annotation_np = torch.cat(motion_annotation_list, dim=0).cpu().numpy()
664
+ motion_pred_np = torch.cat(motion_pred_list, dim=0).cpu().numpy()
665
+ gt_mu, gt_cov = calculate_activation_statistics(motion_annotation_np)
666
+ mu, cov = calculate_activation_statistics(motion_pred_np)
667
+
668
+ diversity_real = calculate_diversity(motion_annotation_np, 300 if nb_sample > 300 else 100)
669
+ diversity = calculate_diversity(motion_pred_np, 300 if nb_sample > 300 else 100)
670
+
671
+ R_precision_real = R_precision_real / nb_sample
672
+ R_precision = R_precision / nb_sample
673
+
674
+ matching_score_real = matching_score_real / nb_sample
675
+ matching_score_pred = matching_score_pred / nb_sample
676
+
677
+ multimodality = 0
678
+ motion_multimodality = torch.cat(motion_multimodality, dim=0).cpu().numpy()
679
+ multimodality = calculate_multimodality(motion_multimodality, 10)
680
+
681
+ fid = calculate_frechet_distance(gt_mu, gt_cov, mu, cov)
682
+
683
+ msg = f"--> \t Eva. Iter {nb_iter} :, FID. {fid:.4f}, Diversity Real. {diversity_real:.4f}, Diversity. {diversity:.4f}, R_precision_real. {R_precision_real}, R_precision. {R_precision}, matching_score_real. {matching_score_real}, matching_score_pred. {matching_score_pred}, multimodality. {multimodality:.4f}"
684
+ logger.info(msg)
685
+
686
+
687
+ if draw:
688
+ for ii in range(len(draw_org)):
689
+ tensorborad_add_video_xyz(writer, draw_org[ii], nb_iter, tag='./Vis/'+draw_name[ii]+'_org', nb_vis=1, title_batch=[draw_text[ii]], outname=[os.path.join(out_dir, draw_name[ii]+'_skel_gt.gif')] if savegif else None)
690
+
691
+ tensorborad_add_video_xyz(writer, draw_pred[ii], nb_iter, tag='./Vis/'+draw_name[ii]+'_pred', nb_vis=1, title_batch=[draw_text_pred[ii]], outname=[os.path.join(out_dir, draw_name[ii]+'_skel_pred.gif')] if savegif else None)
692
+
693
+ trans.train()
694
+ return fid, best_iter, diversity, R_precision[0], R_precision[1], R_precision[2], matching_score_pred, multimodality, writer, logger
695
+
696
+ # (X - X_train)*(X - X_train) = -2X*X_train + X*X + X_train*X_train
697
+ def euclidean_distance_matrix(matrix1, matrix2):
698
+ """
699
+ Params:
700
+ -- matrix1: N1 x D
701
+ -- matrix2: N2 x D
702
+ Returns:
703
+ -- dist: N1 x N2
704
+ dist[i, j] == distance(matrix1[i], matrix2[j])
705
+ """
706
+ assert matrix1.shape[1] == matrix2.shape[1]
707
+ d1 = -2 * np.dot(matrix1, matrix2.T) # shape (num_test, num_train)
708
+ d2 = np.sum(np.square(matrix1), axis=1, keepdims=True) # shape (num_test, 1)
709
+ d3 = np.sum(np.square(matrix2), axis=1) # shape (num_train, )
710
+ dists = np.sqrt(d1 + d2 + d3) # broadcasting
711
+ return dists
712
+
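As a quick check of the identity in the comment above (||x - y||^2 = x.x - 2 x.y + y.y), here is a minimal NumPy sketch comparing the expanded form used in euclidean_distance_matrix against direct pairwise distances (illustrative values only):

import numpy as np

a = np.array([[0.0, 0.0], [1.0, 1.0]])   # N1 x D
b = np.array([[3.0, 4.0]])               # N2 x D

direct = np.linalg.norm(a[:, None, :] - b[None, :, :], axis=-1)
expanded = np.sqrt(-2 * a @ b.T + (a ** 2).sum(1, keepdims=True) + (b ** 2).sum(1))
assert np.allclose(direct, expanded)     # both give [[5.0], [3.6055...]]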
713
+
714
+
715
+ def calculate_top_k(mat, top_k):
716
+ size = mat.shape[0]
717
+ gt_mat = np.expand_dims(np.arange(size), 1).repeat(size, 1)
718
+ bool_mat = (mat == gt_mat)
719
+ correct_vec = False
720
+ top_k_list = []
721
+ for i in range(top_k):
722
+ # print(correct_vec, bool_mat[:, i])
723
+ correct_vec = (correct_vec | bool_mat[:, i])
724
+ # print(correct_vec)
725
+ top_k_list.append(correct_vec[:, None])
726
+ top_k_mat = np.concatenate(top_k_list, axis=1)
727
+ return top_k_mat
728
+
729
+
730
+ def calculate_R_precision(embedding1, embedding2, top_k, sum_all=False):
731
+ dist_mat = euclidean_distance_matrix(embedding1, embedding2)
732
+ matching_score = dist_mat.trace()
733
+ argmax = np.argsort(dist_mat, axis=1)
734
+ top_k_mat = calculate_top_k(argmax, top_k)
735
+ if sum_all:
736
+ return top_k_mat.sum(axis=0), matching_score
737
+ else:
738
+ return top_k_mat, matching_score
739
+
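A minimal sketch of calculate_R_precision on dummy embeddings (assuming these helpers live in a utils.eval_trans module): when each motion embedding is a slightly noisy copy of its text embedding, every query retrieves its own motion at rank 1.

import numpy as np
from utils.eval_trans import calculate_R_precision

rng = np.random.default_rng(0)
et = rng.normal(size=(32, 512))              # stand-in text embeddings
em = et + 0.01 * rng.normal(size=(32, 512))  # motions that almost match their text

top_k_counts, matching = calculate_R_precision(et, em, top_k=3, sum_all=True)
print(top_k_counts / 32)   # -> [1. 1. 1.]  (Top-1/2/3 all perfect on this toy data)
print(matching / 32)       # -> small average matching (distance) score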
740
+ def calculate_multimodality(activation, multimodality_times):
741
+ assert len(activation.shape) == 3
742
+ assert activation.shape[1] > multimodality_times
743
+ num_per_sent = activation.shape[1]
744
+
745
+ first_dices = np.random.choice(num_per_sent, multimodality_times, replace=False)
746
+ second_dices = np.random.choice(num_per_sent, multimodality_times, replace=False)
747
+ dist = linalg.norm(activation[:, first_dices] - activation[:, second_dices], axis=2)
748
+ return dist.mean()
749
+
750
+
751
+ def calculate_diversity(activation, diversity_times):
752
+ assert len(activation.shape) == 2
753
+ assert activation.shape[0] > diversity_times
754
+ num_samples = activation.shape[0]
755
+
756
+ first_indices = np.random.choice(num_samples, diversity_times, replace=False)
757
+ second_indices = np.random.choice(num_samples, diversity_times, replace=False)
758
+ dist = linalg.norm(activation[first_indices] - activation[second_indices], axis=1)
759
+ return dist.mean()
760
+
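Illustrative input shapes for the two sampling-based metrics above (dummy activations, module path assumed to be utils.eval_trans): calculate_diversity expects a flat (num_samples, dim) matrix, while calculate_multimodality expects (num_texts, repeats_per_text, dim) with more repeats than multimodality_times.

import numpy as np
from utils.eval_trans import calculate_diversity, calculate_multimodality

rng = np.random.default_rng(0)
flat_feats = rng.normal(size=(512, 256))          # num_samples x dim
per_text_feats = rng.normal(size=(64, 30, 256))   # num_texts x repeats x dim

div = calculate_diversity(flat_feats, diversity_times=300)
mm = calculate_multimodality(per_text_feats, multimodality_times=10)
print(div, mm)   # two scalar averages of pairwise distances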
761
+
762
+
763
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
764
+
765
+ mu1 = np.atleast_1d(mu1)
766
+ mu2 = np.atleast_1d(mu2)
767
+
768
+ sigma1 = np.atleast_2d(sigma1)
769
+ sigma2 = np.atleast_2d(sigma2)
770
+
771
+ assert mu1.shape == mu2.shape, \
772
+ 'Training and test mean vectors have different lengths'
773
+ assert sigma1.shape == sigma2.shape, \
774
+ 'Training and test covariances have different dimensions'
775
+
776
+ diff = mu1 - mu2
777
+
778
+ # Product might be almost singular
779
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
780
+ if not np.isfinite(covmean).all():
781
+ msg = ('fid calculation produces singular product; '
782
+ 'adding %s to diagonal of cov estimates') % eps
783
+ print(msg)
784
+ offset = np.eye(sigma1.shape[0]) * eps
785
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
786
+
787
+ # Numerical error might give slight imaginary component
788
+ if np.iscomplexobj(covmean):
789
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
790
+ m = np.max(np.abs(covmean.imag))
791
+ raise ValueError('Imaginary component {}'.format(m))
792
+ covmean = covmean.real
793
+
794
+ tr_covmean = np.trace(covmean)
795
+
796
+ return (diff.dot(diff) + np.trace(sigma1)
797
+ + np.trace(sigma2) - 2 * tr_covmean)
798
+
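A small sanity check for calculate_frechet_distance (illustrative, module path assumed to be utils.eval_trans): the distance of a Gaussian fit against itself is ~0, and with equal covariances it reduces to the squared mean shift.

import numpy as np
from utils.eval_trans import calculate_frechet_distance

rng = np.random.default_rng(0)
x = rng.normal(size=(1000, 8))
mu, cov = x.mean(axis=0), np.cov(x, rowvar=False)

print(calculate_frechet_distance(mu, cov, mu, cov))        # ~0.0
print(calculate_frechet_distance(mu, cov, mu + 1.0, cov))  # ~8.0 (squared shift of 1.0 in 8 dims)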
799
+
800
+
801
+ def calculate_activation_statistics(activations):
802
+
803
+ mu = np.mean(activations, axis=0)
804
+ cov = np.cov(activations, rowvar=False)
805
+ return mu, cov
806
+
807
+
808
+ def calculate_frechet_feature_distance(feature_list1, feature_list2):
809
+ feature_list1 = np.stack(feature_list1)
810
+ feature_list2 = np.stack(feature_list2)
811
+
812
+ # normalize the scale
813
+ mean = np.mean(feature_list1, axis=0)
814
+ std = np.std(feature_list1, axis=0) + 1e-10
815
+ feature_list1 = (feature_list1 - mean) / std
816
+ feature_list2 = (feature_list2 - mean) / std
817
+
818
+ dist = calculate_frechet_distance(
819
+ mu1=np.mean(feature_list1, axis=0),
820
+ sigma1=np.cov(feature_list1, rowvar=False),
821
+ mu2=np.mean(feature_list2, axis=0),
822
+ sigma2=np.cov(feature_list2, rowvar=False),
823
+ )
824
+ return dist
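And a hedged end-to-end sketch of calculate_frechet_feature_distance (module path assumed to be utils.eval_trans), with random vectors standing in for the real evaluator features:

import numpy as np
from utils.eval_trans import calculate_frechet_feature_distance

rng = np.random.default_rng(0)
gt_feats = [rng.normal(size=16) for _ in range(300)]    # stand-in ground-truth features
pred_feats = [rng.normal(size=16) for _ in range(300)]  # stand-in generated features

# features are normalised by the statistics of the first list, then compared with a Frechet distance
print(calculate_frechet_feature_distance(gt_feats, pred_feats))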
utils/humanml_utils.py ADDED
@@ -0,0 +1,68 @@
1
+ import numpy as np
2
+
3
+ HML_JOINT_NAMES = [
4
+ 'pelvis',
5
+ 'left_hip',
6
+ 'right_hip',
7
+ 'spine1',
8
+ 'left_knee',
9
+ 'right_knee',
10
+ 'spine2',
11
+ 'left_ankle',
12
+ 'right_ankle',
13
+ 'spine3',
14
+ 'left_foot',
15
+ 'right_foot',
16
+ 'neck',
17
+ 'left_collar',
18
+ 'right_collar',
19
+ 'head',
20
+ 'left_shoulder',
21
+ 'right_shoulder',
22
+ 'left_elbow',
23
+ 'right_elbow',
24
+ 'left_wrist',
25
+ 'right_wrist',
26
+ ]
27
+
28
+ NUM_HML_JOINTS = len(HML_JOINT_NAMES) # 22 SMPLH body joints
29
+
30
+ HML_LOWER_BODY_JOINTS = [HML_JOINT_NAMES.index(name) for name in ['pelvis', 'left_hip', 'right_hip', 'left_knee', 'right_knee', 'left_ankle', 'right_ankle', 'left_foot', 'right_foot',]]
31
+ SMPL_UPPER_BODY_JOINTS = [i for i in range(len(HML_JOINT_NAMES)) if i not in HML_LOWER_BODY_JOINTS]
32
+
33
+
34
+ # Recover global angle and positions for rotation data
35
+ # root_rot_velocity (B, seq_len, 1)
36
+ # root_linear_velocity (B, seq_len, 2)
37
+ # root_y (B, seq_len, 1)
38
+ # ric_data (B, seq_len, (joint_num - 1)*3)
39
+ # rot_data (B, seq_len, (joint_num - 1)*6)
40
+ # local_velocity (B, seq_len, joint_num*3)
41
+ # foot contact (B, seq_len, 4)
42
+ HML_ROOT_BINARY = np.array([True] + [False] * (NUM_HML_JOINTS-1))
43
+ HML_ROOT_MASK = np.concatenate(([True]*(1+2+1),
44
+ HML_ROOT_BINARY[1:].repeat(3),
45
+ HML_ROOT_BINARY[1:].repeat(6),
46
+ HML_ROOT_BINARY.repeat(3),
47
+ [False] * 4))
48
+ HML_LOWER_BODY_JOINTS_BINARY = np.array([i in HML_LOWER_BODY_JOINTS for i in range(NUM_HML_JOINTS)])
49
+ HML_LOWER_BODY_MASK = np.concatenate(([True]*(1+2+1),
50
+ HML_LOWER_BODY_JOINTS_BINARY[1:].repeat(3),
51
+ HML_LOWER_BODY_JOINTS_BINARY[1:].repeat(6),
52
+ HML_LOWER_BODY_JOINTS_BINARY.repeat(3),
53
+ [True]*4))
54
+ HML_UPPER_BODY_MASK = ~HML_LOWER_BODY_MASK
55
+
56
+
57
+ ALL_JOINT_FALSE = np.full(HML_ROOT_BINARY.shape, False)
58
+ HML_UPPER_BODY_JOINTS_BINARY = np.array([i in SMPL_UPPER_BODY_JOINTS for i in range(NUM_HML_JOINTS)])
59
+
60
+ UPPER_JOINT_Y_TRUE = np.array([ALL_JOINT_FALSE[1:], HML_UPPER_BODY_JOINTS_BINARY[1:], ALL_JOINT_FALSE[1:]])
61
+ UPPER_JOINT_Y_TRUE = UPPER_JOINT_Y_TRUE.T
62
+ UPPER_JOINT_Y_TRUE = UPPER_JOINT_Y_TRUE.reshape(ALL_JOINT_FALSE[1:].shape[0]*3)
63
+
64
+ UPPER_JOINT_Y_MASK = np.concatenate(([False]*(1+2+1),
65
+ UPPER_JOINT_Y_TRUE,
66
+ ALL_JOINT_FALSE[1:].repeat(6),
67
+ ALL_JOINT_FALSE.repeat(3),
68
+ [False] * 4))
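The masks above index the flattened HumanML3D feature vector whose layout is listed in the comment block: 4 root channels, (J-1)*3 ric positions, (J-1)*6 rotations, J*3 local velocities and 4 foot contacts, i.e. 263 channels for the 22 joints. A small sketch to check the arithmetic (it only assumes the constants above are importable):

import numpy as np
from utils.humanml_utils import (NUM_HML_JOINTS, HML_ROOT_MASK,
                                 HML_LOWER_BODY_MASK, HML_UPPER_BODY_MASK)

J = NUM_HML_JOINTS                                       # 22
feat_dim = 4 + (J - 1) * 3 + (J - 1) * 6 + J * 3 + 4     # = 263
assert HML_ROOT_MASK.shape[0] == feat_dim
assert HML_LOWER_BODY_MASK.shape[0] == feat_dim
# every feature channel belongs to exactly one of the two body masks
assert np.all(HML_LOWER_BODY_MASK ^ HML_UPPER_BODY_MASK)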
utils/losses.py ADDED
@@ -0,0 +1,30 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class ReConsLoss(nn.Module):
5
+ def __init__(self, recons_loss, nb_joints):
6
+ super(ReConsLoss, self).__init__()
7
+
8
+ if recons_loss == 'l1':
9
+ self.Loss = torch.nn.L1Loss()
10
+ elif recons_loss == 'l2' :
11
+ self.Loss = torch.nn.MSELoss()
12
+ elif recons_loss == 'l1_smooth' :
13
+ self.Loss = torch.nn.SmoothL1Loss()
14
+
15
+ # 4 global motion associated to root
16
+ # 12 local motion (3 local xyz, 3 vel xyz, 6 rot6d)
17
+ # 3 global vel xyz
18
+ # 4 foot contact
19
+ self.nb_joints = nb_joints
20
+ self.motion_dim = (nb_joints - 1) * 12 + 4 + 3 + 4
21
+
22
+ def forward(self, motion_pred, motion_gt) :
23
+ loss = self.Loss(motion_pred[..., : self.motion_dim], motion_gt[..., :self.motion_dim])
24
+ return loss
25
+
26
+ def forward_joint(self, motion_pred, motion_gt) :
27
+ loss = self.Loss(motion_pred[..., 4 : (self.nb_joints - 1) * 3 + 4], motion_gt[..., 4 : (self.nb_joints - 1) * 3 + 4])
28
+ return loss
29
+
30
+
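A minimal usage sketch for ReConsLoss (illustrative tensors; the 263 channels follow the per-joint layout noted in the comment above, (nb_joints - 1) * 12 + 4 + 3 + 4):

import torch
from utils.losses import ReConsLoss

loss_fn = ReConsLoss(recons_loss='l1_smooth', nb_joints=22)
print(loss_fn.motion_dim)              # (22 - 1) * 12 + 4 + 3 + 4 = 263

pred = torch.randn(8, 64, 263)         # batch x frames x features
gt = torch.randn(8, 64, 263)
print(loss_fn(pred, gt))               # reconstruction loss over the motion features
print(loss_fn.forward_joint(pred, gt)) # loss restricted to the local joint positions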
utils/motion_process.py ADDED
@@ -0,0 +1,59 @@
1
+ import torch
2
+ from utils.quaternion import quaternion_to_cont6d, qrot, qinv
3
+
4
+ def recover_root_rot_pos(data):
5
+ rot_vel = data[..., 0]
6
+ r_rot_ang = torch.zeros_like(rot_vel).to(data.device)
7
+ '''Get Y-axis rotation from rotation velocity'''
8
+ r_rot_ang[..., 1:] = rot_vel[..., :-1]
9
+ r_rot_ang = torch.cumsum(r_rot_ang, dim=-1)
10
+
11
+ r_rot_quat = torch.zeros(data.shape[:-1] + (4,)).to(data.device)
12
+ r_rot_quat[..., 0] = torch.cos(r_rot_ang)
13
+ r_rot_quat[..., 2] = torch.sin(r_rot_ang)
14
+
15
+ r_pos = torch.zeros(data.shape[:-1] + (3,)).to(data.device)
16
+ r_pos[..., 1:, [0, 2]] = data[..., :-1, 1:3]
17
+ '''Add Y-axis rotation to root position'''
18
+ r_pos = qrot(qinv(r_rot_quat), r_pos)
19
+
20
+ r_pos = torch.cumsum(r_pos, dim=-2)
21
+
22
+ r_pos[..., 1] = data[..., 3]
23
+ return r_rot_quat, r_pos
24
+
25
+
26
+ def recover_from_rot(data, joints_num, skeleton):
27
+ r_rot_quat, r_pos = recover_root_rot_pos(data)
28
+
29
+ r_rot_cont6d = quaternion_to_cont6d(r_rot_quat)
30
+
31
+ start_indx = 1 + 2 + 1 + (joints_num - 1) * 3
32
+ end_indx = start_indx + (joints_num - 1) * 6
33
+ cont6d_params = data[..., start_indx:end_indx]
34
+ # print(r_rot_cont6d.shape, cont6d_params.shape, r_pos.shape)
35
+ cont6d_params = torch.cat([r_rot_cont6d, cont6d_params], dim=-1)
36
+ cont6d_params = cont6d_params.view(-1, joints_num, 6)
37
+
38
+ positions = skeleton.forward_kinematics_cont6d(cont6d_params, r_pos)
39
+
40
+ return positions
41
+
42
+
43
+ def recover_from_ric(data, joints_num):
44
+ r_rot_quat, r_pos = recover_root_rot_pos(data)
45
+ positions = data[..., 4:(joints_num - 1) * 3 + 4]
46
+ positions = positions.view(positions.shape[:-1] + (-1, 3))
47
+
48
+ '''Add Y-axis rotation to local joints'''
49
+ positions = qrot(qinv(r_rot_quat[..., None, :]).expand(positions.shape[:-1] + (4,)), positions)
50
+
51
+ '''Add root XZ to joints'''
52
+ positions[..., 0] += r_pos[..., 0:1]
53
+ positions[..., 2] += r_pos[..., 2:3]
54
+
55
+ '''Concatenate root and joints'''
56
+ positions = torch.cat([r_pos.unsqueeze(-2), positions], dim=-2)
57
+
58
+ return positions
59
+
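A hedged usage sketch for recover_from_ric: it maps de-normalised HumanML3D features of shape (..., 263) to joint positions of shape (..., 22, 3). The random tensor below only illustrates shapes, not a plausible motion.

import torch
from utils.motion_process import recover_from_ric

feats = torch.randn(2, 60, 263)                 # batch x frames x 263 (22-joint features)
joints = recover_from_ric(feats, joints_num=22)
print(joints.shape)                             # torch.Size([2, 60, 22, 3])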
utils/paramUtil.py ADDED
@@ -0,0 +1,63 @@
1
+ import numpy as np
2
+
3
+ # Define a kinematic tree for the skeletal structure
4
+ kit_kinematic_chain = [[0, 11, 12, 13, 14, 15], [0, 16, 17, 18, 19, 20], [0, 1, 2, 3, 4], [3, 5, 6, 7], [3, 8, 9, 10]]
5
+
6
+ kit_raw_offsets = np.array(
7
+ [
8
+ [0, 0, 0],
9
+ [0, 1, 0],
10
+ [0, 1, 0],
11
+ [0, 1, 0],
12
+ [0, 1, 0],
13
+ [1, 0, 0],
14
+ [0, -1, 0],
15
+ [0, -1, 0],
16
+ [-1, 0, 0],
17
+ [0, -1, 0],
18
+ [0, -1, 0],
19
+ [1, 0, 0],
20
+ [0, -1, 0],
21
+ [0, -1, 0],
22
+ [0, 0, 1],
23
+ [0, 0, 1],
24
+ [-1, 0, 0],
25
+ [0, -1, 0],
26
+ [0, -1, 0],
27
+ [0, 0, 1],
28
+ [0, 0, 1]
29
+ ]
30
+ )
31
+
32
+ t2m_raw_offsets = np.array([[0,0,0],
33
+ [1,0,0],
34
+ [-1,0,0],
35
+ [0,1,0],
36
+ [0,-1,0],
37
+ [0,-1,0],
38
+ [0,1,0],
39
+ [0,-1,0],
40
+ [0,-1,0],
41
+ [0,1,0],
42
+ [0,0,1],
43
+ [0,0,1],
44
+ [0,1,0],
45
+ [1,0,0],
46
+ [-1,0,0],
47
+ [0,0,1],
48
+ [0,-1,0],
49
+ [0,-1,0],
50
+ [0,-1,0],
51
+ [0,-1,0],
52
+ [0,-1,0],
53
+ [0,-1,0]])
54
+
55
+ t2m_kinematic_chain = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10], [0, 3, 6, 9, 12, 15], [9, 14, 17, 19, 21], [9, 13, 16, 18, 20]]
56
+ t2m_left_hand_chain = [[20, 22, 23, 24], [20, 34, 35, 36], [20, 25, 26, 27], [20, 31, 32, 33], [20, 28, 29, 30]]
57
+ t2m_right_hand_chain = [[21, 43, 44, 45], [21, 46, 47, 48], [21, 40, 41, 42], [21, 37, 38, 39], [21, 49, 50, 51]]
58
+
59
+
60
+ kit_tgt_skel_id = '03950'
61
+
62
+ t2m_tgt_skel_id = '000021'
63
+
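The kinematic chains above list joint indices along each limb; here is a short illustrative sketch flattening t2m_kinematic_chain into parent-child bone edges, e.g. for drawing a skeleton:

from utils.paramUtil import t2m_kinematic_chain

bones = [(chain[i], chain[i + 1])
         for chain in t2m_kinematic_chain
         for i in range(len(chain) - 1)]
print(len(bones))   # 21 edges for the 22-joint skeleton
print(bones[:3])    # [(0, 2), (2, 5), (5, 8)]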
utils/quaternion.py ADDED
@@ -0,0 +1,423 @@
1
+ # Copyright (c) 2018-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+
8
+ import torch
9
+ import numpy as np
10
+
11
+ _EPS4 = np.finfo(float).eps * 4.0
12
+
13
+ _FLOAT_EPS = np.finfo(float).eps
14
+
15
+ # PyTorch-backed implementations
16
+ def qinv(q):
17
+ assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
18
+ mask = torch.ones_like(q)
19
+ mask[..., 1:] = -mask[..., 1:]
20
+ return q * mask
21
+
22
+
23
+ def qinv_np(q):
24
+ assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
25
+ return qinv(torch.from_numpy(q).float()).numpy()
26
+
27
+
28
+ def qnormalize(q):
29
+ assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
30
+ return q / torch.norm(q, dim=-1, keepdim=True)
31
+
32
+
33
+ def qmul(q, r):
34
+ """
35
+ Multiply quaternion(s) q with quaternion(s) r.
36
+ Expects two equally-sized tensors of shape (*, 4), where * denotes any number of dimensions.
37
+ Returns q*r as a tensor of shape (*, 4).
38
+ """
39
+ assert q.shape[-1] == 4
40
+ assert r.shape[-1] == 4
41
+
42
+ original_shape = q.shape
43
+
44
+ # Compute outer product
45
+ terms = torch.bmm(r.view(-1, 4, 1), q.view(-1, 1, 4))
46
+
47
+ w = terms[:, 0, 0] - terms[:, 1, 1] - terms[:, 2, 2] - terms[:, 3, 3]
48
+ x = terms[:, 0, 1] + terms[:, 1, 0] - terms[:, 2, 3] + terms[:, 3, 2]
49
+ y = terms[:, 0, 2] + terms[:, 1, 3] + terms[:, 2, 0] - terms[:, 3, 1]
50
+ z = terms[:, 0, 3] - terms[:, 1, 2] + terms[:, 2, 1] + terms[:, 3, 0]
51
+ return torch.stack((w, x, y, z), dim=1).view(original_shape)
52
+
53
+
54
+ def qrot(q, v):
55
+ """
56
+ Rotate vector(s) v about the rotation described by quaternion(s) q.
57
+ Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
58
+ where * denotes any number of dimensions.
59
+ Returns a tensor of shape (*, 3).
60
+ """
61
+ assert q.shape[-1] == 4
62
+ assert v.shape[-1] == 3
63
+ assert q.shape[:-1] == v.shape[:-1]
64
+
65
+ original_shape = list(v.shape)
66
+ # print(q.shape)
67
+ q = q.contiguous().view(-1, 4)
68
+ v = v.contiguous().view(-1, 3)
69
+
70
+ qvec = q[:, 1:]
71
+ uv = torch.cross(qvec, v, dim=1)
72
+ uuv = torch.cross(qvec, uv, dim=1)
73
+ return (v + 2 * (q[:, :1] * uv + uuv)).view(original_shape)
74
+
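A small check of qrot using the (w, x, y, z) convention assumed throughout this file: a 90 degree rotation about +Y should send the +X axis to -Z.

import math
import torch
from utils.quaternion import qrot

half = math.pi / 4                                              # half of 90 degrees
q = torch.tensor([math.cos(half), 0.0, math.sin(half), 0.0])    # (w, x, y, z)
v = torch.tensor([1.0, 0.0, 0.0])
print(qrot(q, v))                                               # ~tensor([0., 0., -1.])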
75
+
76
+ def qeuler(q, order, epsilon=0, deg=True):
77
+ """
78
+ Convert quaternion(s) q to Euler angles.
79
+ Expects a tensor of shape (*, 4), where * denotes any number of dimensions.
80
+ Returns a tensor of shape (*, 3).
81
+ """
82
+ assert q.shape[-1] == 4
83
+
84
+ original_shape = list(q.shape)
85
+ original_shape[-1] = 3
86
+ q = q.view(-1, 4)
87
+
88
+ q0 = q[:, 0]
89
+ q1 = q[:, 1]
90
+ q2 = q[:, 2]
91
+ q3 = q[:, 3]
92
+
93
+ if order == 'xyz':
94
+ x = torch.atan2(2 * (q0 * q1 - q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
95
+ y = torch.asin(torch.clamp(2 * (q1 * q3 + q0 * q2), -1 + epsilon, 1 - epsilon))
96
+ z = torch.atan2(2 * (q0 * q3 - q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3))
97
+ elif order == 'yzx':
98
+ x = torch.atan2(2 * (q0 * q1 - q2 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
99
+ y = torch.atan2(2 * (q0 * q2 - q1 * q3), 1 - 2 * (q2 * q2 + q3 * q3))
100
+ z = torch.asin(torch.clamp(2 * (q1 * q2 + q0 * q3), -1 + epsilon, 1 - epsilon))
101
+ elif order == 'zxy':
102
+ x = torch.asin(torch.clamp(2 * (q0 * q1 + q2 * q3), -1 + epsilon, 1 - epsilon))
103
+ y = torch.atan2(2 * (q0 * q2 - q1 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
104
+ z = torch.atan2(2 * (q0 * q3 - q1 * q2), 1 - 2 * (q1 * q1 + q3 * q3))
105
+ elif order == 'xzy':
106
+ x = torch.atan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
107
+ y = torch.atan2(2 * (q0 * q2 + q1 * q3), 1 - 2 * (q2 * q2 + q3 * q3))
108
+ z = torch.asin(torch.clamp(2 * (q0 * q3 - q1 * q2), -1 + epsilon, 1 - epsilon))
109
+ elif order == 'yxz':
110
+ x = torch.asin(torch.clamp(2 * (q0 * q1 - q2 * q3), -1 + epsilon, 1 - epsilon))
111
+ y = torch.atan2(2 * (q1 * q3 + q0 * q2), 1 - 2 * (q1 * q1 + q2 * q2))
112
+ z = torch.atan2(2 * (q1 * q2 + q0 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
113
+ elif order == 'zyx':
114
+ x = torch.atan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
115
+ y = torch.asin(torch.clamp(2 * (q0 * q2 - q1 * q3), -1 + epsilon, 1 - epsilon))
116
+ z = torch.atan2(2 * (q0 * q3 + q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3))
117
+ else:
118
+ raise ValueError('Invalid Euler order: ' + order)
119
+
120
+ if deg:
121
+ return torch.stack((x, y, z), dim=1).view(original_shape) * 180 / np.pi
122
+ else:
123
+ return torch.stack((x, y, z), dim=1).view(original_shape)
124
+
125
+
126
+ # Numpy-backed implementations
127
+
128
+ def qmul_np(q, r):
129
+ q = torch.from_numpy(q).contiguous().float()
130
+ r = torch.from_numpy(r).contiguous().float()
131
+ return qmul(q, r).numpy()
132
+
133
+
134
+ def qrot_np(q, v):
135
+ q = torch.from_numpy(q).contiguous().float()
136
+ v = torch.from_numpy(v).contiguous().float()
137
+ return qrot(q, v).numpy()
138
+
139
+
140
+ def qeuler_np(q, order, epsilon=0, use_gpu=False):
141
+ if use_gpu:
142
+ q = torch.from_numpy(q).cuda().float()
143
+ return qeuler(q, order, epsilon).cpu().numpy()
144
+ else:
145
+ q = torch.from_numpy(q).contiguous().float()
146
+ return qeuler(q, order, epsilon).numpy()
147
+
148
+
149
+ def qfix(q):
150
+ """
151
+ Enforce quaternion continuity across the time dimension by selecting
152
+ the representation (q or -q) with minimal distance (or, equivalently, maximal dot product)
153
+ between two consecutive frames.
154
+
155
+ Expects a tensor of shape (L, J, 4), where L is the sequence length and J is the number of joints.
156
+ Returns a tensor of the same shape.
157
+ """
158
+ assert len(q.shape) == 3
159
+ assert q.shape[-1] == 4
160
+
161
+ result = q.copy()
162
+ dot_products = np.sum(q[1:] * q[:-1], axis=2)
163
+ mask = dot_products < 0
164
+ mask = (np.cumsum(mask, axis=0) % 2).astype(bool)
165
+ result[1:][mask] *= -1
166
+ return result
167
+
168
+
169
+ def euler2quat(e, order, deg=True):
170
+ """
171
+ Convert Euler angles to quaternions.
172
+ """
173
+ assert e.shape[-1] == 3
174
+
175
+ original_shape = list(e.shape)
176
+ original_shape[-1] = 4
177
+
178
+ e = e.view(-1, 3)
179
+
180
+ ## if euler angles in degrees
181
+ if deg:
182
+ e = e * np.pi / 180.
183
+
184
+ x = e[:, 0]
185
+ y = e[:, 1]
186
+ z = e[:, 2]
187
+
188
+ rx = torch.stack((torch.cos(x / 2), torch.sin(x / 2), torch.zeros_like(x), torch.zeros_like(x)), dim=1)
189
+ ry = torch.stack((torch.cos(y / 2), torch.zeros_like(y), torch.sin(y / 2), torch.zeros_like(y)), dim=1)
190
+ rz = torch.stack((torch.cos(z / 2), torch.zeros_like(z), torch.zeros_like(z), torch.sin(z / 2)), dim=1)
191
+
192
+ result = None
193
+ for coord in order:
194
+ if coord == 'x':
195
+ r = rx
196
+ elif coord == 'y':
197
+ r = ry
198
+ elif coord == 'z':
199
+ r = rz
200
+ else:
201
+ raise
202
+ if result is None:
203
+ result = r
204
+ else:
205
+ result = qmul(result, r)
206
+
207
+ # Reverse antipodal representation to have a non-negative "w"
208
+ if order in ['xyz', 'yzx', 'zxy']:
209
+ result *= -1
210
+
211
+ return result.view(original_shape)
212
+
213
+
214
+ def expmap_to_quaternion(e):
215
+ """
216
+ Convert axis-angle rotations (aka exponential maps) to quaternions.
217
+ Stable formula from "Practical Parameterization of Rotations Using the Exponential Map".
218
+ Expects a tensor of shape (*, 3), where * denotes any number of dimensions.
219
+ Returns a tensor of shape (*, 4).
220
+ """
221
+ assert e.shape[-1] == 3
222
+
223
+ original_shape = list(e.shape)
224
+ original_shape[-1] = 4
225
+ e = e.reshape(-1, 3)
226
+
227
+ theta = np.linalg.norm(e, axis=1).reshape(-1, 1)
228
+ w = np.cos(0.5 * theta).reshape(-1, 1)
229
+ xyz = 0.5 * np.sinc(0.5 * theta / np.pi) * e
230
+ return np.concatenate((w, xyz), axis=1).reshape(original_shape)
231
+
232
+
233
+ def euler_to_quaternion(e, order):
234
+ """
235
+ Convert Euler angles to quaternions.
236
+ """
237
+ assert e.shape[-1] == 3
238
+
239
+ original_shape = list(e.shape)
240
+ original_shape[-1] = 4
241
+
242
+ e = e.reshape(-1, 3)
243
+
244
+ x = e[:, 0]
245
+ y = e[:, 1]
246
+ z = e[:, 2]
247
+
248
+ rx = np.stack((np.cos(x / 2), np.sin(x / 2), np.zeros_like(x), np.zeros_like(x)), axis=1)
249
+ ry = np.stack((np.cos(y / 2), np.zeros_like(y), np.sin(y / 2), np.zeros_like(y)), axis=1)
250
+ rz = np.stack((np.cos(z / 2), np.zeros_like(z), np.zeros_like(z), np.sin(z / 2)), axis=1)
251
+
252
+ result = None
253
+ for coord in order:
254
+ if coord == 'x':
255
+ r = rx
256
+ elif coord == 'y':
257
+ r = ry
258
+ elif coord == 'z':
259
+ r = rz
260
+ else:
261
+ raise
262
+ if result is None:
263
+ result = r
264
+ else:
265
+ result = qmul_np(result, r)
266
+
267
+ # Reverse antipodal representation to have a non-negative "w"
268
+ if order in ['xyz', 'yzx', 'zxy']:
269
+ result *= -1
270
+
271
+ return result.reshape(original_shape)
272
+
273
+
274
+ def quaternion_to_matrix(quaternions):
275
+ """
276
+ Convert rotations given as quaternions to rotation matrices.
277
+ Args:
278
+ quaternions: quaternions with real part first,
279
+ as tensor of shape (..., 4).
280
+ Returns:
281
+ Rotation matrices as tensor of shape (..., 3, 3).
282
+ """
283
+ r, i, j, k = torch.unbind(quaternions, -1)
284
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
285
+
286
+ o = torch.stack(
287
+ (
288
+ 1 - two_s * (j * j + k * k),
289
+ two_s * (i * j - k * r),
290
+ two_s * (i * k + j * r),
291
+ two_s * (i * j + k * r),
292
+ 1 - two_s * (i * i + k * k),
293
+ two_s * (j * k - i * r),
294
+ two_s * (i * k - j * r),
295
+ two_s * (j * k + i * r),
296
+ 1 - two_s * (i * i + j * j),
297
+ ),
298
+ -1,
299
+ )
300
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
301
+
302
+
303
+ def quaternion_to_matrix_np(quaternions):
304
+ q = torch.from_numpy(quaternions).contiguous().float()
305
+ return quaternion_to_matrix(q).numpy()
306
+
307
+
308
+ def quaternion_to_cont6d_np(quaternions):
309
+ rotation_mat = quaternion_to_matrix_np(quaternions)
310
+ cont_6d = np.concatenate([rotation_mat[..., 0], rotation_mat[..., 1]], axis=-1)
311
+ return cont_6d
312
+
313
+
314
+ def quaternion_to_cont6d(quaternions):
315
+ rotation_mat = quaternion_to_matrix(quaternions)
316
+ cont_6d = torch.cat([rotation_mat[..., 0], rotation_mat[..., 1]], dim=-1)
317
+ return cont_6d
318
+
319
+
320
+ def cont6d_to_matrix(cont6d):
321
+ assert cont6d.shape[-1] == 6, "The last dimension must be 6"
322
+ x_raw = cont6d[..., 0:3]
323
+ y_raw = cont6d[..., 3:6]
324
+
325
+ x = x_raw / torch.norm(x_raw, dim=-1, keepdim=True)
326
+ z = torch.cross(x, y_raw, dim=-1)
327
+ z = z / torch.norm(z, dim=-1, keepdim=True)
328
+
329
+ y = torch.cross(z, x, dim=-1)
330
+
331
+ x = x[..., None]
332
+ y = y[..., None]
333
+ z = z[..., None]
334
+
335
+ mat = torch.cat([x, y, z], dim=-1)
336
+ return mat
337
+
338
+
339
+ def cont6d_to_matrix_np(cont6d):
340
+ q = torch.from_numpy(cont6d).contiguous().float()
341
+ return cont6d_to_matrix(q).numpy()
342
+
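A quick consistency sketch for the 6D rotation helpers above: converting a quaternion to its continuous 6D form and back through cont6d_to_matrix should reproduce quaternion_to_matrix.

import math
import torch
from utils.quaternion import quaternion_to_matrix, quaternion_to_cont6d, cont6d_to_matrix

half = math.pi / 6                                              # a 60 degree rotation about +Y
q = torch.tensor([math.cos(half), 0.0, math.sin(half), 0.0])
R_direct = quaternion_to_matrix(q)
R_via_6d = cont6d_to_matrix(quaternion_to_cont6d(q))
assert torch.allclose(R_direct, R_via_6d, atol=1e-6)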
343
+
344
+ def qpow(q0, t, dtype=torch.float):
345
+ ''' q0 : tensor of quaternions
346
+ t: tensor of powers
347
+ '''
348
+ q0 = qnormalize(q0)
349
+ theta0 = torch.acos(q0[..., 0])
350
+
351
+ ## if theta0 is close to zero, add epsilon to avoid NaNs
352
+ mask = (theta0 <= 10e-10) * (theta0 >= -10e-10)
353
+ theta0 = (1 - mask) * theta0 + mask * 10e-10
354
+ v0 = q0[..., 1:] / torch.sin(theta0).view(-1, 1)
355
+
356
+ if isinstance(t, torch.Tensor):
357
+ q = torch.zeros(t.shape + q0.shape)
358
+ theta = t.view(-1, 1) * theta0.view(1, -1)
359
+ else: ## if t is a number
360
+ q = torch.zeros(q0.shape)
361
+ theta = t * theta0
362
+
363
+ q[..., 0] = torch.cos(theta)
364
+ q[..., 1:] = v0 * torch.sin(theta).unsqueeze(-1)
365
+
366
+ return q.to(dtype)
367
+
368
+
369
+ def qslerp(q0, q1, t):
370
+ '''
371
+ q0: starting quaternion
372
+ q1: ending quaternion
373
+ t: array of points along the way
374
+
375
+ Returns:
376
+ Tensor of Slerps: t.shape + q0.shape
377
+ '''
378
+
379
+ q0 = qnormalize(q0)
380
+ q1 = qnormalize(q1)
381
+ q_ = qpow(qmul(q1, qinv(q0)), t)
382
+
383
+ return qmul(q_,
384
+ q0.contiguous().view(torch.Size([1] * len(t.shape)) + q0.shape).expand(t.shape + q0.shape).contiguous())
385
+
386
+
387
+ def qbetween(v0, v1):
388
+ '''
389
+ find the quaternion used to rotate v0 to v1
390
+ '''
391
+ assert v0.shape[-1] == 3, 'v0 must be of the shape (*, 3)'
392
+ assert v1.shape[-1] == 3, 'v1 must be of the shape (*, 3)'
393
+
394
+ v = torch.cross(v0, v1)
395
+ w = torch.sqrt((v0 ** 2).sum(dim=-1, keepdim=True) * (v1 ** 2).sum(dim=-1, keepdim=True)) + (v0 * v1).sum(dim=-1,
396
+ keepdim=True)
397
+ return qnormalize(torch.cat([w, v], dim=-1))
398
+
399
+
400
+ def qbetween_np(v0, v1):
401
+ '''
402
+ find the quaternion used to rotate v0 to v1
403
+ '''
404
+ assert v0.shape[-1] == 3, 'v0 must be of the shape (*, 3)'
405
+ assert v1.shape[-1] == 3, 'v1 must be of the shape (*, 3)'
406
+
407
+ v0 = torch.from_numpy(v0).float()
408
+ v1 = torch.from_numpy(v1).float()
409
+ return qbetween(v0, v1).numpy()
410
+
411
+
412
+ def lerp(p0, p1, t):
413
+ if not isinstance(t, torch.Tensor):
414
+ t = torch.Tensor([t])
415
+
416
+ new_shape = t.shape + p0.shape
417
+ new_view_t = t.shape + torch.Size([1] * len(p0.shape))
418
+ new_view_p = torch.Size([1] * len(t.shape)) + p0.shape
419
+ p0 = p0.view(new_view_p).expand(new_shape)
420
+ p1 = p1.view(new_view_p).expand(new_shape)
421
+ t = t.view(new_view_t).expand(new_shape)
422
+
423
+ return p0 + t * (p1 - p0)
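Finally, an illustrative check for qbetween: the returned quaternion should rotate the first direction onto the second, which can be verified with qrot.

import torch
from utils.quaternion import qbetween, qrot

v0 = torch.tensor([1.0, 0.0, 0.0])
v1 = torch.tensor([0.0, 1.0, 0.0])
q = qbetween(v0, v1)      # ~(0.7071, 0, 0, 0.7071), a 90 degree turn about +Z
print(qrot(q, v0))        # ~tensor([0., 1., 0.])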
utils/utils_model.py ADDED
@@ -0,0 +1,66 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.optim as optim
4
+ import logging
5
+ import os
6
+ import sys
7
+
8
+ def getCi(accLog):
9
+
10
+ mean = np.mean(accLog)
11
+ std = np.std(accLog)
12
+ ci95 = 1.96*std/np.sqrt(len(accLog))
13
+
14
+ return mean, ci95
15
+
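getCi above returns the mean of a list of scores together with a 95% confidence half-width, 1.96 * std / sqrt(n). A tiny illustrative example:

from utils.utils_model import getCi

acc_log = [0.78, 0.81, 0.79, 0.80, 0.82]
mean, ci95 = getCi(acc_log)
print(f"{mean:.3f} +/- {ci95:.3f}")   # 0.800 +/- 0.012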
16
+ def get_logger(out_dir):
17
+ logger = logging.getLogger('Exp')
18
+ logger.setLevel(logging.INFO)
19
+ formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
20
+
21
+ file_path = os.path.join(out_dir, "run.log")
22
+ file_hdlr = logging.FileHandler(file_path)
23
+ file_hdlr.setFormatter(formatter)
24
+
25
+ strm_hdlr = logging.StreamHandler(sys.stdout)
26
+ strm_hdlr.setFormatter(formatter)
27
+
28
+ logger.addHandler(file_hdlr)
29
+ logger.addHandler(strm_hdlr)
30
+ return logger
31
+
32
+ ## Optimizer
33
+ def initial_optim(decay_option, lr, weight_decay, net, optimizer) :
34
+
35
+ if optimizer == 'adamw' :
36
+ optimizer_adam_family = optim.AdamW
37
+ elif optimizer == 'adam' :
38
+ optimizer_adam_family = optim.Adam
39
+ if decay_option == 'all':
40
+ #optimizer = optimizer_adam_family(net.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)
41
+ optimizer = optimizer_adam_family(net.parameters(), lr=lr, betas=(0.5, 0.9), weight_decay=weight_decay)
42
+
43
+ elif decay_option == 'noVQ':
44
+ all_params = set(net.parameters())
45
+ no_decay = set([net.vq_layer])
46
+
47
+ decay = all_params - no_decay
48
+ optimizer = optimizer_adam_family([
49
+ {'params': list(no_decay), 'weight_decay': 0},
50
+ {'params': list(decay), 'weight_decay' : weight_decay}], lr=lr)
51
+
52
+ return optimizer
53
+
54
+
55
+ def get_motion_with_trans(motion, velocity) :
56
+ '''
57
+ motion : torch.tensor, shape (batch_size, T, 72), with the global translation set to 0
58
+ velocity : torch.tensor, shape (batch_size, T, 3), the global root velocity per frame
59
+
60
+ '''
61
+ trans = torch.cumsum(velocity, dim=1)
62
+ trans = trans - trans[:, :1] ## the first root is initialized at 0 (just for visualization)
63
+ trans = trans.repeat((1, 1, 21))
64
+ motion_with_trans = motion + trans
65
+ return motion_with_trans
66
+
utils/word_vectorizer.py ADDED
@@ -0,0 +1,99 @@
1
+ import numpy as np
2
+ import pickle
3
+ from os.path import join as pjoin
4
+
5
+ POS_enumerator = {
6
+ 'VERB': 0,
7
+ 'NOUN': 1,
8
+ 'DET': 2,
9
+ 'ADP': 3,
10
+ 'NUM': 4,
11
+ 'AUX': 5,
12
+ 'PRON': 6,
13
+ 'ADJ': 7,
14
+ 'ADV': 8,
15
+ 'Loc_VIP': 9,
16
+ 'Body_VIP': 10,
17
+ 'Obj_VIP': 11,
18
+ 'Act_VIP': 12,
19
+ 'Desc_VIP': 13,
20
+ 'OTHER': 14,
21
+ }
22
+
23
+ Loc_list = ('left', 'right', 'clockwise', 'counterclockwise', 'anticlockwise', 'forward', 'back', 'backward',
24
+ 'up', 'down', 'straight', 'curve')
25
+
26
+ Body_list = ('arm', 'chin', 'foot', 'feet', 'face', 'hand', 'mouth', 'leg', 'waist', 'eye', 'knee', 'shoulder', 'thigh')
27
+
28
+ Obj_List = ('stair', 'dumbbell', 'chair', 'window', 'floor', 'car', 'ball', 'handrail', 'baseball', 'basketball')
29
+
30
+ Act_list = ('walk', 'run', 'swing', 'pick', 'bring', 'kick', 'put', 'squat', 'throw', 'hop', 'dance', 'jump', 'turn',
31
+ 'stumble', 'dance', 'stop', 'sit', 'lift', 'lower', 'raise', 'wash', 'stand', 'kneel', 'stroll',
32
+ 'rub', 'bend', 'balance', 'flap', 'jog', 'shuffle', 'lean', 'rotate', 'spin', 'spread', 'climb')
33
+
34
+ Desc_list = ('slowly', 'carefully', 'fast', 'careful', 'slow', 'quickly', 'happy', 'angry', 'sad', 'happily',
35
+ 'angrily', 'sadly')
36
+
37
+ VIP_dict = {
38
+ 'Loc_VIP': Loc_list,
39
+ 'Body_VIP': Body_list,
40
+ 'Obj_VIP': Obj_List,
41
+ 'Act_VIP': Act_list,
42
+ 'Desc_VIP': Desc_list,
43
+ }
44
+
45
+
46
+ class WordVectorizer(object):
47
+ def __init__(self, meta_root, prefix):
48
+ vectors = np.load(pjoin(meta_root, '%s_data.npy'%prefix))
49
+ words = pickle.load(open(pjoin(meta_root, '%s_words.pkl'%prefix), 'rb'))
50
+ self.word2idx = pickle.load(open(pjoin(meta_root, '%s_idx.pkl'%prefix), 'rb'))
51
+ self.word2vec = {w: vectors[self.word2idx[w]] for w in words}
52
+
53
+ def _get_pos_ohot(self, pos):
54
+ pos_vec = np.zeros(len(POS_enumerator))
55
+ if pos in POS_enumerator:
56
+ pos_vec[POS_enumerator[pos]] = 1
57
+ else:
58
+ pos_vec[POS_enumerator['OTHER']] = 1
59
+ return pos_vec
60
+
61
+ def __len__(self):
62
+ return len(self.word2vec)
63
+
64
+ def __getitem__(self, item):
65
+ word, pos = item.split('/')
66
+ if word in self.word2vec:
67
+ word_vec = self.word2vec[word]
68
+ vip_pos = None
69
+ for key, values in VIP_dict.items():
70
+ if word in values:
71
+ vip_pos = key
72
+ break
73
+ if vip_pos is not None:
74
+ pos_vec = self._get_pos_ohot(vip_pos)
75
+ else:
76
+ pos_vec = self._get_pos_ohot(pos)
77
+ else:
78
+ word_vec = self.word2vec['unk']
79
+ pos_vec = self._get_pos_ohot('OTHER')
80
+ return word_vec, pos_vec
81
+
82
+
83
+ class WordVectorizerV2(WordVectorizer):
84
+ def __init__(self, meta_root, prefix):
85
+ super(WordVectorizerV2, self).__init__(meta_root, prefix)
86
+ self.idx2word = {self.word2idx[w]: w for w in self.word2idx}
87
+
88
+ def __getitem__(self, item):
89
+ word_vec, pose_vec = super(WordVectorizerV2, self).__getitem__(item)
90
+ word, pos = item.split('/')
91
+ if word in self.word2vec:
92
+ return word_vec, pose_vec, self.word2idx[word]
93
+ else:
94
+ return word_vec, pose_vec, self.word2idx['unk']
95
+
96
+ def itos(self, idx):
97
+ if idx == len(self.idx2word):
98
+ return "pad"
99
+ return self.idx2word[idx]
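A minimal usage sketch for WordVectorizer; it assumes GloVe metadata files named our_vab_data.npy, our_vab_words.pkl and our_vab_idx.pkl exist under a local glove/ directory, and that keys take the 'word/POS' form handled above.

from utils.word_vectorizer import WordVectorizer

w_vec = WordVectorizer(meta_root='./glove', prefix='our_vab')
word_vec, pos_vec = w_vec['walk/VERB']
print(word_vec.shape)     # the GloVe embedding, e.g. (300,)
print(pos_vec.argmax())   # 12 -> 'Act_VIP', since 'walk' is listed in Act_list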