wxDai committed commit 6b1e9f7
Parent(s): init
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +35 -0
- .gitignore +10 -0
- LICENSE +25 -0
- README.md +14 -0
- app.py +258 -0
- configs/mld_control.yaml +105 -0
- configs/mld_t2m_infer.yaml +72 -0
- configs/modules/denoiser.yaml +16 -0
- configs/modules/motion_vae.yaml +13 -0
- configs/modules/scheduler.yaml +20 -0
- configs/modules/text_encoder.yaml +5 -0
- configs/modules/traj_encoder.yaml +12 -0
- configs/modules_mld/denoiser.yaml +16 -0
- configs/modules_mld/motion_vae.yaml +13 -0
- configs/modules_mld/scheduler.yaml +23 -0
- configs/modules_mld/text_encoder.yaml +5 -0
- configs/modules_mld/traj_encoder.yaml +12 -0
- configs/motionlcm_control.yaml +105 -0
- configs/motionlcm_t2m.yaml +100 -0
- demo.py +154 -0
- fit.py +134 -0
- mld/__init__.py +0 -0
- mld/config.py +47 -0
- mld/data/HumanML3D.py +79 -0
- mld/data/Kit.py +79 -0
- mld/data/__init__.py +0 -0
- mld/data/base.py +65 -0
- mld/data/get_data.py +93 -0
- mld/data/humanml/__init__.py +0 -0
- mld/data/humanml/common/quaternion.py +29 -0
- mld/data/humanml/dataset.py +290 -0
- mld/data/humanml/scripts/motion_process.py +51 -0
- mld/data/humanml/utils/__init__.py +0 -0
- mld/data/humanml/utils/paramUtil.py +62 -0
- mld/data/humanml/utils/plot_script.py +98 -0
- mld/data/humanml/utils/word_vectorizer.py +82 -0
- mld/data/utils.py +38 -0
- mld/launch/__init__.py +0 -0
- mld/launch/blender.py +23 -0
- mld/models/__init__.py +0 -0
- mld/models/architectures/__init__.py +0 -0
- mld/models/architectures/mld_clip.py +72 -0
- mld/models/architectures/mld_denoiser.py +172 -0
- mld/models/architectures/mld_traj_encoder.py +78 -0
- mld/models/architectures/mld_vae.py +154 -0
- mld/models/architectures/t2m_motionenc.py +58 -0
- mld/models/architectures/t2m_textenc.py +43 -0
- mld/models/architectures/tools/embeddings.py +89 -0
- mld/models/metrics/__init__.py +3 -0
- mld/models/metrics/cm.py +55 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,10 @@
+**/*.pyc
+.idea/
+__pycache__/
+
+deps/
+datasets/
+experiments_t2m/
+experiments_t2m_test/
+experiments_control/
+experiments_control_test/
LICENSE
ADDED
@@ -0,0 +1,25 @@
+Copyright Tsinghua University and Shanghai AI Laboratory. All Rights Reserved.
+
+License for Non-commercial Scientific Research Purposes.
+
+For more information see <https://github.com/Dai-Wenxun/MotionLCM>.
+If you use this software, please cite the corresponding publications
+listed on the above website.
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for educational, research, and non-profit purposes only.
+Any modification based on this work must be open-source and prohibited
+for commercial, pornographic, military, or surveillance use.
+
+The authors grant you a non-exclusive, worldwide, non-transferable,
+non-sublicensable, revocable, royalty-free, and limited license under
+our copyright interests to reproduce, distribute, and create derivative
+works of the text, videos, and codes solely for your non-commercial
+research purposes.
+
+You must retain, in the source form of any derivative works that you
+distribute, all copyright, patent, trademark, and attribution notices
+from the source form of this work.
+
+For commercial uses of this software, please send email to all people
+in the author list.
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: MotionLCM
+emoji: 🏃
+colorFrom: yellow
+colorTo: gray
+sdk: gradio
+sdk_version: 3.24.1
+app_file: app.py
+pinned: false
+license: other
+python_version: 3.10.12
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,258 @@
+import os
+import time
+import random
+import datetime
+import os.path as osp
+from functools import partial
+
+import torch
+import gradio as gr
+from omegaconf import OmegaConf
+
+from mld.config import get_module_config
+from mld.data.get_data import get_datasets
+from mld.models.modeltype.mld import MLD
+from mld.utils.utils import set_seed
+from mld.data.humanml.utils.plot_script import plot_3d_motion
+
+WEBSITE = """
+<div class="embed_hidden">
+<h1 style='text-align: center'> MotionLCM: Real-time Controllable Motion Generation via Latent Consistency Model </h1>
+
+<h2 style='text-align: center'>
+<a href="https://github.com/Dai-Wenxun/" target="_blank"><nobr>Wenxun Dai</nobr><sup>1</sup></a> &emsp;
+<a href="https://lhchen.top/" target="_blank"><nobr>Ling-Hao Chen</nobr></a><sup>1</sup> &emsp;
+<a href="https://wangjingbo1219.github.io/" target="_blank"><nobr>Jingbo Wang</nobr></a><sup>2</sup> &emsp;
+<a href="https://moonsliu.github.io/" target="_blank"><nobr>Jinpeng Liu</nobr></a><sup>1</sup> &emsp;
+<a href="https://daibo.info/" target="_blank"><nobr>Bo Dai</nobr></a><sup>2</sup> &emsp;
+<a href="https://andytang15.github.io/" target="_blank"><nobr>Yansong Tang</nobr></a><sup>1</sup>
+</h2>
+
+<h2 style='text-align: center'>
+<nobr><sup>1</sup>Tsinghua University</nobr> &emsp;
+<nobr><sup>2</sup>Shanghai AI Laboratory</nobr>
+</h2>
+
+</div>
+"""
+
+WEBSITE_bottom = """
+<div class="embed_hidden">
+<p>
+Space adapted from <a href="https://huggingface.co/spaces/Mathux/TMR" target="_blank">TMR</a>
+and <a href="https://huggingface.co/spaces/MeYourHint/MoMask" target="_blank">MoMask</a>.
+</p>
+</div>
+"""
+
+EXAMPLES = [
+    "a person does a jump",
+    "a person waves both arms in the air.",
+    "The person takes 4 steps backwards.",
+    "this person bends forward as if to bow.",
+    "The person was pushed but did not fall.",
+    "a man walks forward in a snake like pattern.",
+    "a man paces back and forth along the same line.",
+    "with arms out to the sides a person walks forward",
+    "A man bends down and picks something up with his right hand.",
+    "The man walked forward, spun right on one foot and walked back to his original position.",
+    "a person slightly bent over with right hand pressing against the air walks forward slowly"
+]
+
+CSS = """
+.contour_video {
+    display: flex;
+    flex-direction: column;
+    justify-content: center;
+    align-items: center;
+    z-index: var(--layer-5);
+    border-radius: var(--block-radius);
+    background: var(--background-fill-primary);
+    padding: 0 var(--size-6);
+    max-height: var(--size-screen-h);
+    overflow: hidden;
+}
+"""
+
+if not os.path.exists("./experiments_t2m/"):
+    os.system("bash prepare/download_pretrained_models.sh")
+if not os.path.exists('./deps/glove/'):
+    os.system("bash prepare/download_glove.sh")
+if not os.path.exists('./deps/sentence-t5-large/'):
+    os.system("bash prepare/prepare_t5.sh")
+if not os.path.exists('./deps/t2m/'):
+    os.system("bash prepare/download_t2m_evaluators.sh")
+if not os.path.exists('./datasets/humanml3d/'):
+    os.system("bash prepare/prepare_tiny_humanml3d.sh")
+
+DEFAULT_TEXT = "A person is "
+MAX_VIDEOS = 12
+T2M_CFG = "./configs/motionlcm_t2m.yaml"
+
+device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+cfg = OmegaConf.load(T2M_CFG)
+cfg_model = get_module_config(cfg.model, cfg.model.target)
+cfg = OmegaConf.merge(cfg, cfg_model)
+set_seed(1949)
+
+name_time_str = osp.join(cfg.NAME, datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
+output_dir = osp.join(cfg.TEST_FOLDER, name_time_str)
+vis_dir = osp.join(output_dir, 'samples')
+os.makedirs(output_dir, exist_ok=False)
+os.makedirs(vis_dir, exist_ok=False)
+
+state_dict = torch.load(cfg.TEST.CHECKPOINTS, map_location="cpu")["state_dict"]
+print("Loading checkpoints from {}".format(cfg.TEST.CHECKPOINTS))
+
+lcm_key = 'denoiser.time_embedding.cond_proj.weight'
+is_lcm = False
+if lcm_key in state_dict:
+    is_lcm = True
+    time_cond_proj_dim = state_dict[lcm_key].shape[1]
+    cfg.model.denoiser.params.time_cond_proj_dim = time_cond_proj_dim
+print(f'Is LCM: {is_lcm}')
+
+cfg.model.is_controlnet = False
+
+datasets = get_datasets(cfg, phase="test")[0]
+model = MLD(cfg, datasets)
+model.to(device)
+model.eval()
+model.load_state_dict(state_dict)
+
+
+@torch.no_grad()
+def generate(text, motion_len, num_videos):
+    batch = {"text": [text] * num_videos, "length": [motion_len] * num_videos}
+
+    s = time.time()
+    joints, _ = model(batch)
+    runtime = round(time.time() - s, 3)
+    runtime_info = f'Inference {len(joints)} motions, runtime: {runtime}s, device: {device}'
+    path = []
+    for i in range(num_videos):
+        uid = random.randrange(999999999)
+        video_path = osp.join(vis_dir, f"sample_{uid}.mp4")
+        plot_3d_motion(video_path, joints[i].detach().cpu().numpy(), '', fps=20)
+        path.append(video_path)
+    return path, runtime_info
+
+
+# HTML component
+def get_video_html(path, video_id, width=700, height=700):
+    video_html = f"""
+<video class="contour_video" width="{width}" height="{height}" preload="auto" muted playsinline onpause="this.load()"
+autoplay loop disablepictureinpicture id="{video_id}">
+  <source src="file/{path}" type="video/mp4">
+  Your browser does not support the video tag.
+</video>
+"""
+    return video_html
+
+
+def generate_component(generate_function, text, motion_len, num_videos):
+    if text == DEFAULT_TEXT or text == "" or text is None:
+        return [None for _ in range(MAX_VIDEOS)] + [None]
+
+    motion_len = max(36, min(int(float(motion_len) * 20), 196))
+    paths, info = generate_function(text, motion_len, num_videos)
+    htmls = [get_video_html(path, idx) for idx, path in enumerate(paths)]
+    htmls = htmls + [None for _ in range(max(0, MAX_VIDEOS - num_videos))]
+    return htmls + [info]
+
+
+theme = gr.themes.Default(primary_hue="purple", secondary_hue="gray")
+generate_and_show = partial(generate_component, generate)
+
+with gr.Blocks(css=CSS, theme=theme) as demo:
+    gr.HTML(WEBSITE)
+    videos = []
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            text = gr.Textbox(
+                show_label=True,
+                label="Text prompt",
+                value=DEFAULT_TEXT,
+            )
+
+            with gr.Row():
+                with gr.Column(scale=1):
+                    motion_len = gr.Textbox(
+                        show_label=True,
+                        label="Motion length (in seconds, <=9.8s)",
+                        value=5,
+                        info="Any length exceeding 9.8s will be restricted to 9.8s.",
+                    )
+                with gr.Column(scale=1):
+                    num_videos = gr.Radio(
+                        [1, 4, 8, 12],
+                        label="Videos",
+                        value=8,
+                        info="Number of videos to generate.",
+                    )
+
+            gen_btn = gr.Button("Generate", variant="primary")
+            clear = gr.Button("Clear", variant="secondary")
+
+            results = gr.Textbox(show_label=True,
+                                 label='Inference info (runtime and device)',
+                                 info='Real-time inference cannot be achieved using the free CPU. Local GPU deployment is recommended.',
+                                 interactive=False)
+
+        with gr.Column(scale=2):
+            def generate_example(text, motion_len, num_videos):
+                return generate_and_show(text, motion_len, num_videos)
+
+            examples = gr.Examples(
+                examples=[[x, None, None] for x in EXAMPLES],
+                inputs=[text, motion_len, num_videos],
+                examples_per_page=12,
+                run_on_click=False,
+                cache_examples=False,
+                fn=generate_example,
+                outputs=[],
+            )
+
+    for _ in range(3):
+        with gr.Row():
+            for _ in range(4):
+                video = gr.HTML()
+                videos.append(video)
+
+    # gr.HTML(WEBSITE_bottom)
+    # connect the examples to the output
+    # a bit hacky
+    examples.outputs = videos
+
+    def load_example(example_id):
+        processed_example = examples.non_none_processed_examples[example_id]
+        return gr.utils.resolve_singleton(processed_example)
+
+    examples.dataset.click(
+        load_example,
+        inputs=[examples.dataset],
+        outputs=examples.inputs_with_examples,  # type: ignore
+        show_progress=False,
+        postprocess=False,
+        queue=False,
+    ).then(fn=generate_example, inputs=examples.inputs, outputs=videos + [results])
+
+    gen_btn.click(
+        fn=generate_and_show,
+        inputs=[text, motion_len, num_videos],
+        outputs=videos + [results],
+    )
+    text.submit(
+        fn=generate_and_show,
+        inputs=[text, motion_len, num_videos],
+        outputs=videos + [results],
+    )
+
+    def clear_videos():
+        return [None for _ in range(MAX_VIDEOS)] + [DEFAULT_TEXT] + [None]
+
+    clear.click(fn=clear_videos, outputs=videos + [text] + [results])
+
+demo.launch()
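A note on the length handling in generate_component above: the textbox value is interpreted in seconds, converted at 20 fps, and clamped to the 36-196 frame window the model supports (1.8 s to 9.8 s), which is why the UI caps requests at 9.8 s. A minimal sketch of that arithmetic (the helper name is ours, not part of app.py):

def seconds_to_frames(seconds: float) -> int:
    # mirrors app.py: 20 fps, clamped to the supported 36-196 frame window
    return max(36, min(int(float(seconds) * 20), 196))

assert seconds_to_frames(5) == 100   # the default 5 s prompt -> 100 frames
assert seconds_to_frames(12) == 196  # anything above 9.8 s is clipped to 196 frames
assert seconds_to_frames(1) == 36    # very short requests are raised to the minimum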
configs/mld_control.yaml
ADDED
@@ -0,0 +1,105 @@
+FOLDER: './experiments_control'
+TEST_FOLDER: './experiments_control_test'
+
+NAME: 'mld_humanml'
+
+TRAIN:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 128
+  SPLIT: 'train'
+  NUM_WORKERS: 8
+  PERSISTENT_WORKERS: true
+  SEED_VALUE: 1234
+  PRETRAINED: 'experiments_t2m/mld_humanml/mld_humanml.ckpt'
+
+  validation_steps: -1
+  validation_epochs: 50
+  checkpointing_steps: -1
+  checkpointing_epochs: 50
+  max_train_steps: -1
+  max_train_epochs: 1000
+  learning_rate: 1e-4
+  learning_rate_spatial: 1e-4
+  lr_scheduler: "cosine"
+  lr_warmup_steps: 1000
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_weight_decay: 0.0
+  adam_epsilon: 1e-08
+  max_grad_norm: 1.0
+
+EVAL:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 32
+  SPLIT: 'test'
+  NUM_WORKERS: 12
+
+TEST:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 1
+  SPLIT: 'test'
+  NUM_WORKERS: 12
+
+  CHECKPOINTS: 'experiments_control/mld_humanml/mld_humanml.ckpt'
+
+  # Testing Args
+  REPLICATION_TIMES: 1
+  MM_NUM_SAMPLES: 100
+  MM_NUM_REPEATS: 30
+  MM_NUM_TIMES: 10
+  DIVERSITY_TIMES: 300
+  MAX_NUM_SAMPLES: 1024
+
+DATASET:
+  SMPL_PATH: './deps/smpl'
+  WORD_VERTILIZER_PATH: './deps/glove/'
+  HUMANML3D:
+    PICK_ONE_TEXT: true
+    FRAME_RATE: 20.0
+    UNIT_LEN: 4
+    ROOT: './datasets/humanml3d'
+    SPLIT_ROOT: './datasets/humanml3d'
+  SAMPLER:
+    MAX_LEN: 196
+    MIN_LEN: 40
+    MAX_TEXT_LEN: 20
+
+METRIC:
+  DIST_SYNC_ON_STEP: true
+  TYPE: ['TM2TMetrics', 'ControlMetrics']
+
+model:
+  target: 'modules_mld'
+  latent_dim: [1, 256]
+  guidance_scale: 7.5
+  guidance_uncondp: 0.1
+
+  # ControlNet Args
+  is_controlnet: true
+  is_controlnet_temporal: false
+  training_control_joint: [0]
+  testing_control_joint: [0]
+  training_density: 'random'
+  testing_density: 100
+  control_scale: 1.0
+  vaeloss: true
+  vaeloss_type: 'sum'
+  cond_ratio: 1.0
+  rot_ratio: 0.0
+
+  t2m_textencoder:
+    dim_word: 300
+    dim_pos_ohot: 15
+    dim_text_hidden: 512
+    dim_coemb_hidden: 512
+
+  t2m_motionencoder:
+    dim_move_hidden: 512
+    dim_move_latent: 512
+    dim_motion_hidden: 1024
+    dim_motion_latent: 512
+
+  bert_path: './deps/distilbert-base-uncased'
+  clip_path: './deps/clip-vit-large-patch14'
+  t5_path: './deps/sentence-t5-large'
+  t2m_path: './deps/t2m/'
configs/mld_t2m_infer.yaml
ADDED
@@ -0,0 +1,72 @@
+FOLDER: './experiments_t2m'
+TEST_FOLDER: './experiments_t2m_test'
+
+NAME: 'mld_humanml'
+
+TRAIN:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 1
+  NUM_WORKERS: 8
+  PERSISTENT_WORKERS: true
+  SEED_VALUE: 1234
+
+EVAL:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 32
+  SPLIT: test
+  NUM_WORKERS: 12
+
+TEST:
+  DATASETS: ['humanml3d']
+  SPLIT: test
+  BATCH_SIZE: 1
+  NUM_WORKERS: 12
+
+  CHECKPOINTS: 'experiments_t2m/mld_humanml/mld_humanml.ckpt'
+
+  # Testing Args
+  REPLICATION_TIMES: 20
+  MM_NUM_SAMPLES: 100
+  MM_NUM_REPEATS: 30
+  MM_NUM_TIMES: 10
+  DIVERSITY_TIMES: 300
+
+DATASET:
+  SMPL_PATH: './deps/smpl'
+  WORD_VERTILIZER_PATH: './deps/glove/'
+  HUMANML3D:
+    PICK_ONE_TEXT: true
+    FRAME_RATE: 20.0
+    UNIT_LEN: 4
+    ROOT: './datasets/humanml3d'
+    SPLIT_ROOT: './datasets/humanml3d'
+  SAMPLER:
+    MAX_LEN: 196
+    MIN_LEN: 40
+    MAX_TEXT_LEN: 20
+
+METRIC:
+  DIST_SYNC_ON_STEP: True
+  TYPE: ['TM2TMetrics']
+
+model:
+  target: 'modules_mld'
+  latent_dim: [1, 256]
+  guidance_scale: 7.5
+
+  t2m_textencoder:
+    dim_word: 300
+    dim_pos_ohot: 15
+    dim_text_hidden: 512
+    dim_coemb_hidden: 512
+
+  t2m_motionencoder:
+    dim_move_hidden: 512
+    dim_move_latent: 512
+    dim_motion_hidden: 1024
+    dim_motion_latent: 512
+
+  bert_path: './deps/distilbert-base-uncased'
+  clip_path: './deps/clip-vit-large-patch14'
+  t5_path: './deps/sentence-t5-large'
+  t2m_path: './deps/t2m/'
configs/modules/denoiser.yaml
ADDED
@@ -0,0 +1,16 @@
+denoiser:
+  target: mld.models.architectures.mld_denoiser.MldDenoiser
+  params:
+    text_encoded_dim: 768
+    ff_size: 1024
+    num_layers: 9
+    num_heads: 4
+    dropout: 0.1
+    normalize_before: false
+    activation: 'gelu'
+    flip_sin_to_cos: true
+    return_intermediate_dec: false
+    position_embedding: 'learned'
+    arch: 'trans_enc'
+    freq_shift: 0
+    latent_dim: ${model.latent_dim}
configs/modules/motion_vae.yaml
ADDED
@@ -0,0 +1,13 @@
+motion_vae:
+  target: mld.models.architectures.mld_vae.MldVae
+  params:
+    arch: 'encoder_decoder'
+    ff_size: 1024
+    num_layers: 9
+    num_heads: 4
+    dropout: 0.1
+    normalize_before: false
+    activation: 'gelu'
+    position_embedding: 'learned'
+    latent_dim: ${model.latent_dim}
+    nfeats: ${DATASET.NFEATS}
configs/modules/scheduler.yaml
ADDED
@@ -0,0 +1,20 @@
+scheduler:
+  target: diffusers.LCMScheduler
+  num_inference_timesteps: 1
+  params:
+    num_train_timesteps: 1000
+    beta_start: 0.00085
+    beta_end: 0.012
+    beta_schedule: 'scaled_linear'
+    clip_sample: false
+    set_alpha_to_one: false
+
+noise_scheduler:
+  target: diffusers.DDPMScheduler
+  params:
+    num_train_timesteps: 1000
+    beta_start: 0.00085
+    beta_end: 0.012
+    beta_schedule: 'scaled_linear'
+    variance_type: 'fixed_small'
+    clip_sample: false
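Since each module config names a `target` class and a `params` block (see mld/config.py later in this diff), the `params` above presumably map directly onto the constructor of diffusers' LCMScheduler, while `num_inference_timesteps` is read separately by the model code. A minimal sketch of building the same scheduler by hand, assuming a diffusers version that ships LCMScheduler:

from diffusers import LCMScheduler

# Equivalent to instantiating the 'scheduler' entry above with its params block.
scheduler = LCMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
)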
configs/modules/text_encoder.yaml
ADDED
@@ -0,0 +1,5 @@
+text_encoder:
+  target: mld.models.architectures.mld_clip.MldTextEncoder
+  params:
+    last_hidden_state: false  # if true, the last hidden state is used as the text embedding
+    modelpath: ${model.t5_path}
configs/modules/traj_encoder.yaml
ADDED
@@ -0,0 +1,12 @@
+traj_encoder:
+  target: mld.models.architectures.mld_traj_encoder.MldTrajEncoder
+  params:
+    ff_size: 1024
+    num_layers: 9
+    num_heads: 4
+    dropout: 0.1
+    normalize_before: false
+    activation: 'gelu'
+    position_embedding: 'learned'
+    latent_dim: ${model.latent_dim}
+    nfeats: ${DATASET.NJOINTS}
configs/modules_mld/denoiser.yaml
ADDED
@@ -0,0 +1,16 @@
+denoiser:
+  target: mld.models.architectures.mld_denoiser.MldDenoiser
+  params:
+    text_encoded_dim: 768
+    ff_size: 1024
+    num_layers: 9
+    num_heads: 4
+    dropout: 0.1
+    normalize_before: false
+    activation: 'gelu'
+    flip_sin_to_cos: true
+    return_intermediate_dec: false
+    position_embedding: 'learned'
+    arch: 'trans_enc'
+    freq_shift: 0
+    latent_dim: ${model.latent_dim}
configs/modules_mld/motion_vae.yaml
ADDED
@@ -0,0 +1,13 @@
+motion_vae:
+  target: mld.models.architectures.mld_vae.MldVae
+  params:
+    arch: 'encoder_decoder'
+    ff_size: 1024
+    num_layers: 9
+    num_heads: 4
+    dropout: 0.1
+    normalize_before: false
+    activation: 'gelu'
+    position_embedding: 'learned'
+    latent_dim: ${model.latent_dim}
+    nfeats: ${DATASET.NFEATS}
configs/modules_mld/scheduler.yaml
ADDED
@@ -0,0 +1,23 @@
+scheduler:
+  target: diffusers.DDIMScheduler
+  num_inference_timesteps: 50
+  eta: 0.0
+  params:
+    num_train_timesteps: 1000
+    beta_start: 0.00085
+    beta_end: 0.012
+    beta_schedule: 'scaled_linear'
+    clip_sample: false
+    # below are for ddim
+    set_alpha_to_one: false
+    steps_offset: 1
+
+noise_scheduler:
+  target: diffusers.DDPMScheduler
+  params:
+    num_train_timesteps: 1000
+    beta_start: 0.00085
+    beta_end: 0.012
+    beta_schedule: 'scaled_linear'
+    variance_type: 'fixed_small'
+    clip_sample: false
configs/modules_mld/text_encoder.yaml
ADDED
@@ -0,0 +1,5 @@
+text_encoder:
+  target: mld.models.architectures.mld_clip.MldTextEncoder
+  params:
+    last_hidden_state: false  # if true, the last hidden state is used as the text embedding
+    modelpath: ${model.t5_path}
configs/modules_mld/traj_encoder.yaml
ADDED
@@ -0,0 +1,12 @@
+traj_encoder:
+  target: mld.models.architectures.mld_traj_encoder.MldTrajEncoder
+  params:
+    ff_size: 1024
+    num_layers: 9
+    num_heads: 4
+    dropout: 0.1
+    normalize_before: false
+    activation: 'gelu'
+    position_embedding: 'learned'
+    latent_dim: ${model.latent_dim}
+    nfeats: ${DATASET.NJOINTS}
configs/motionlcm_control.yaml
ADDED
@@ -0,0 +1,105 @@
+FOLDER: './experiments_control'
+TEST_FOLDER: './experiments_control_test'
+
+NAME: 'motionlcm_humanml'
+
+TRAIN:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 128
+  SPLIT: 'train'
+  NUM_WORKERS: 8
+  PERSISTENT_WORKERS: true
+  SEED_VALUE: 1234
+  PRETRAINED: 'experiments_t2m/motionlcm_humanml/motionlcm_humanml.ckpt'
+
+  validation_steps: -1
+  validation_epochs: 50
+  checkpointing_steps: -1
+  checkpointing_epochs: 50
+  max_train_steps: -1
+  max_train_epochs: 1000
+  learning_rate: 1e-4
+  learning_rate_spatial: 1e-4
+  lr_scheduler: "cosine"
+  lr_warmup_steps: 1000
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_weight_decay: 0.0
+  adam_epsilon: 1e-08
+  max_grad_norm: 1.0
+
+EVAL:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 32
+  SPLIT: 'test'
+  NUM_WORKERS: 12
+
+TEST:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 1
+  SPLIT: 'test'
+  NUM_WORKERS: 12
+
+  CHECKPOINTS: 'experiments_control/motionlcm_humanml/motionlcm_humanml.ckpt'
+
+  # Testing Args
+  REPLICATION_TIMES: 1
+  MM_NUM_SAMPLES: 100
+  MM_NUM_REPEATS: 30
+  MM_NUM_TIMES: 10
+  DIVERSITY_TIMES: 300
+  MAX_NUM_SAMPLES: 1024
+
+DATASET:
+  SMPL_PATH: './deps/smpl'
+  WORD_VERTILIZER_PATH: './deps/glove/'
+  HUMANML3D:
+    PICK_ONE_TEXT: true
+    FRAME_RATE: 20.0
+    UNIT_LEN: 4
+    ROOT: './datasets/humanml3d'
+    SPLIT_ROOT: './datasets/humanml3d'
+  SAMPLER:
+    MAX_LEN: 196
+    MIN_LEN: 40
+    MAX_TEXT_LEN: 20
+
+METRIC:
+  DIST_SYNC_ON_STEP: true
+  TYPE: ['TM2TMetrics', 'ControlMetrics']
+
+model:
+  target: 'modules'
+  latent_dim: [1, 256]
+  guidance_scale: 7.5
+  guidance_uncondp: 0.0
+
+  # ControlNet Args
+  is_controlnet: true
+  is_controlnet_temporal: false
+  training_control_joint: [0]
+  testing_control_joint: [0]
+  training_density: 'random'
+  testing_density: 100
+  control_scale: 1.0
+  vaeloss: true
+  vaeloss_type: 'sum'
+  cond_ratio: 1.0
+  rot_ratio: 0.0
+
+  t2m_textencoder:
+    dim_word: 300
+    dim_pos_ohot: 15
+    dim_text_hidden: 512
+    dim_coemb_hidden: 512
+
+  t2m_motionencoder:
+    dim_move_hidden: 512
+    dim_move_latent: 512
+    dim_motion_hidden: 1024
+    dim_motion_latent: 512
+
+  bert_path: './deps/distilbert-base-uncased'
+  clip_path: './deps/clip-vit-large-patch14'
+  t5_path: './deps/sentence-t5-large'
+  t2m_path: './deps/t2m/'
configs/motionlcm_t2m.yaml
ADDED
@@ -0,0 +1,100 @@
+FOLDER: './experiments_t2m'
+TEST_FOLDER: './experiments_t2m_test'
+
+NAME: 'motionlcm_humanml'
+
+TRAIN:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 256
+  SPLIT: 'train'
+  NUM_WORKERS: 8
+  PERSISTENT_WORKERS: true
+  SEED_VALUE: 1234
+  PRETRAINED: 'experiments_t2m/mld_humanml/mld_humanml.ckpt'
+
+  validation_steps: -1
+  validation_epochs: 50
+  checkpointing_steps: -1
+  checkpointing_epochs: 50
+  max_train_steps: -1
+  max_train_epochs: 1000
+  learning_rate: 2e-4
+  lr_scheduler: "cosine"
+  lr_warmup_steps: 1000
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_weight_decay: 0.0
+  adam_epsilon: 1e-08
+  max_grad_norm: 1.0
+
+  # Latent Consistency Distillation Specific Arguments
+  w_min: 5.0
+  w_max: 15.0
+  num_ddim_timesteps: 50
+  loss_type: 'huber'
+  huber_c: 0.001
+  unet_time_cond_proj_dim: 256
+  ema_decay: 0.95
+
+EVAL:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 32
+  SPLIT: 'test'
+  NUM_WORKERS: 12
+
+TEST:
+  DATASETS: ['humanml3d']
+  BATCH_SIZE: 1
+  SPLIT: 'test'
+  NUM_WORKERS: 12
+
+  CHECKPOINTS: 'experiments_t2m/motionlcm_humanml/motionlcm_humanml.ckpt'
+
+  # Testing Args
+  REPLICATION_TIMES: 20
+  MM_NUM_SAMPLES: 100
+  MM_NUM_REPEATS: 30
+  MM_NUM_TIMES: 10
+  DIVERSITY_TIMES: 300
+
+DATASET:
+  SMPL_PATH: './deps/smpl'
+  WORD_VERTILIZER_PATH: './deps/glove/'
+  HUMANML3D:
+    PICK_ONE_TEXT: true
+    FRAME_RATE: 20.0
+    UNIT_LEN: 4
+    ROOT: './datasets/humanml3d'
+    SPLIT_ROOT: './datasets/humanml3d'
+  SAMPLER:
+    MAX_LEN: 196
+    MIN_LEN: 40
+    MAX_TEXT_LEN: 20
+
+METRIC:
+  DIST_SYNC_ON_STEP: true
+  TYPE: ['TM2TMetrics']
+
+model:
+  target: 'modules'
+  latent_dim: [1, 256]
+  guidance_scale: 7.5
+  guidance_uncondp: 0.0
+  is_controlnet: false
+
+  t2m_textencoder:
+    dim_word: 300
+    dim_pos_ohot: 15
+    dim_text_hidden: 512
+    dim_coemb_hidden: 512
+
+  t2m_motionencoder:
+    dim_move_hidden: 512
+    dim_move_latent: 512
+    dim_motion_hidden: 1024
+    dim_motion_latent: 512
+
+  bert_path: './deps/distilbert-base-uncased'
+  clip_path: './deps/clip-vit-large-patch14'
+  t5_path: './deps/sentence-t5-large'
+  t2m_path: './deps/t2m/'
demo.py
ADDED
@@ -0,0 +1,154 @@
+import os
+import pickle
+import sys
+import datetime
+import logging
+import os.path as osp
+
+from omegaconf import OmegaConf
+
+import torch
+
+from mld.config import parse_args
+from mld.data.get_data import get_datasets
+from mld.models.modeltype.mld import MLD
+from mld.utils.utils import set_seed, move_batch_to_device
+from mld.data.humanml.utils.plot_script import plot_3d_motion
+from mld.utils.temos_utils import remove_padding
+
+
+def load_example_input(text_path: str) -> tuple:
+    with open(text_path, "r") as f:
+        lines = f.readlines()
+
+    count = 0
+    texts, lens = [], []
+    # Strips the newline character
+    for line in lines:
+        count += 1
+        s = line.strip()
+        s_l = s.split(" ")[0]
+        s_t = s[(len(s_l) + 1):]
+        lens.append(int(s_l))
+        texts.append(s_t)
+    return texts, lens
+
+
+def main():
+    cfg = parse_args()
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+    set_seed(cfg.TRAIN.SEED_VALUE)
+
+    name_time_str = osp.join(cfg.NAME, "demo_" + datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
+    output_dir = osp.join(cfg.TEST_FOLDER, name_time_str)
+    vis_dir = osp.join(output_dir, 'samples')
+    os.makedirs(output_dir, exist_ok=False)
+    os.makedirs(vis_dir, exist_ok=False)
+
+    steam_handler = logging.StreamHandler(sys.stdout)
+    file_handler = logging.FileHandler(osp.join(output_dir, 'output.log'))
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+                        datefmt="%m/%d/%Y %H:%M:%S",
+                        handlers=[steam_handler, file_handler])
+    logger = logging.getLogger(__name__)
+
+    OmegaConf.save(cfg, osp.join(output_dir, 'config.yaml'))
+
+    state_dict = torch.load(cfg.TEST.CHECKPOINTS, map_location="cpu")["state_dict"]
+    logger.info("Loading checkpoints from {}".format(cfg.TEST.CHECKPOINTS))
+
+    lcm_key = 'denoiser.time_embedding.cond_proj.weight'
+    is_lcm = False
+    if lcm_key in state_dict:
+        is_lcm = True
+        time_cond_proj_dim = state_dict[lcm_key].shape[1]
+        cfg.model.denoiser.params.time_cond_proj_dim = time_cond_proj_dim
+    logger.info(f'Is LCM: {is_lcm}')
+
+    cn_key = "controlnet.controlnet_cond_embedding.0.weight"
+    is_controlnet = True if cn_key in state_dict else False
+    cfg.model.is_controlnet = is_controlnet
+    logger.info(f'Is Controlnet: {is_controlnet}')
+
+    datasets = get_datasets(cfg, phase="test")[0]
+    model = MLD(cfg, datasets)
+    model.to(device)
+    model.eval()
+    model.load_state_dict(state_dict)
+
+    # example only support text-to-motion
+    if cfg.example is not None and not is_controlnet:
+        text, length = load_example_input(cfg.example)
+        for t, l in zip(text, length):
+            logger.info(f"{l}: {t}")
+
+        batch = {"length": length, "text": text}
+
+        for rep_i in range(cfg.replication):
+            with torch.no_grad():
+                joints, _ = model(batch)
+
+            num_samples = len(joints)
+            batch_id = 0
+            for i in range(num_samples):
+                res = dict()
+                pkl_path = osp.join(vis_dir, f"batch_id_{batch_id}_sample_id_{i}_length_{length[i]}_rep_{rep_i}.pkl")
+                res['joints'] = joints[i].detach().cpu().numpy()
+                res['text'] = text[i]
+                res['length'] = length[i]
+                res['hint'] = None
+                with open(pkl_path, 'wb') as f:
+                    pickle.dump(res, f)
+                logger.info(f"Motions are generated here:\n{pkl_path}")
+
+                if not cfg.no_plot:
+                    plot_3d_motion(pkl_path.replace('.pkl', '.mp4'), joints[i].detach().cpu().numpy(), text[i], fps=20)
+
+    else:
+        test_dataloader = datasets.test_dataloader()
+        for rep_i in range(cfg.replication):
+            for batch_id, batch in enumerate(test_dataloader):
+                batch = move_batch_to_device(batch, device)
+                with torch.no_grad():
+                    joints, joints_ref = model(batch)
+
+                num_samples = len(joints)
+                text = batch['text']
+                length = batch['length']
+                if 'hint' in batch:
+                    hint = batch['hint']
+                    mask_hint = hint.view(hint.shape[0], hint.shape[1], model.njoints, 3).sum(dim=-1, keepdim=True) != 0
+                    hint = model.datamodule.denorm_spatial(hint)
+                    hint = hint.view(hint.shape[0], hint.shape[1], model.njoints, 3) * mask_hint
+                    hint = remove_padding(hint, lengths=length)
+                else:
+                    hint = None
+
+                for i in range(num_samples):
+                    res = dict()
+                    pkl_path = osp.join(vis_dir, f"batch_id_{batch_id}_sample_id_{i}_length_{length[i]}_rep_{rep_i}.pkl")
+                    res['joints'] = joints[i].detach().cpu().numpy()
+                    res['text'] = text[i]
+                    res['length'] = length[i]
+                    res['hint'] = hint[i].detach().cpu().numpy() if hint is not None else None
+                    with open(pkl_path, 'wb') as f:
+                        pickle.dump(res, f)
+                    logger.info(f"Motions are generated here:\n{pkl_path}")
+
+                    if not cfg.no_plot:
+                        plot_3d_motion(pkl_path.replace('.pkl', '.mp4'), joints[i].detach().cpu().numpy(),
+                                       text[i], fps=20, hint=hint[i].detach().cpu().numpy() if hint is not None else None)
+
+                    if rep_i == 0:
+                        res['joints'] = joints_ref[i].detach().cpu().numpy()
+                        with open(pkl_path.replace('.pkl', '_ref.pkl'), 'wb') as f:
+                            pickle.dump(res, f)
+                        logger.info(f"Motions are generated here:\n{pkl_path.replace('.pkl', '_ref.pkl')}")
+                        if not cfg.no_plot:
+                            plot_3d_motion(pkl_path.replace('.pkl', '_ref.mp4'), joints_ref[i].detach().cpu().numpy(),
+                                           text[i], fps=20, hint=hint[i].detach().cpu().numpy() if hint is not None else None)
+
+
+if __name__ == "__main__":
+    main()
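For reference, load_example_input above expects a plain text file in which every line starts with an integer frame count followed by a space and the prompt. An illustrative file (contents are only an example, not shipped with the repo) would look like:

196 a person walks forward and then turns around.
120 a person jumps in place.

It is passed via the --example flag defined in mld/config.py, e.g. python demo.py --cfg configs/motionlcm_t2m.yaml --example <your_prompts.txt>; without --example, the script falls back to sampling from the test dataloader.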
fit.py
ADDED
@@ -0,0 +1,134 @@
+# borrow from optimization https://github.com/wangsen1312/joints2smpl
+import os
+import argparse
+import pickle
+
+import h5py
+import natsort
+import smplx
+
+import torch
+
+from mld.transforms.joints2rots import config
+from mld.transforms.joints2rots.smplify import SMPLify3D
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--pkl", type=str, default=None, help="pkl motion file")
+parser.add_argument("--dir", type=str, default=None, help="pkl motion folder")
+parser.add_argument("--num_smplify_iters", type=int, default=150, help="num of smplify iters")
+parser.add_argument("--cuda", type=bool, default=True, help="enables cuda")
+parser.add_argument("--gpu_ids", type=int, default=0, help="choose gpu ids")
+parser.add_argument("--num_joints", type=int, default=22, help="joint number")
+parser.add_argument("--joint_category", type=str, default="AMASS", help="use correspondence")
+parser.add_argument("--fix_foot", type=str, default="False", help="fix foot or not")
+opt = parser.parse_args()
+print(opt)
+
+if opt.pkl:
+    paths = [opt.pkl]
+elif opt.dir:
+    paths = []
+    file_list = natsort.natsorted(os.listdir(opt.dir))
+    for item in file_list:
+        if item.endswith('.pkl') and not item.endswith("_mesh.pkl"):
+            paths.append(os.path.join(opt.dir, item))
+else:
+    raise ValueError(f'{opt.pkl} and {opt.dir} are both None!')
+
+for path in paths:
+    # load joints
+    if os.path.exists(path.replace('.pkl', '_mesh.pkl')):
+        print(f"{path} is rendered! skip!")
+        continue
+
+    with open(path, 'rb') as f:
+        data = pickle.load(f)
+
+    joints = data['joints']
+    # load predefined something
+    device = torch.device("cuda:" + str(opt.gpu_ids) if opt.cuda else "cpu")
+    print(config.SMPL_MODEL_DIR)
+    smplxmodel = smplx.create(
+        config.SMPL_MODEL_DIR,
+        model_type="smpl",
+        gender="neutral",
+        ext="pkl",
+        batch_size=joints.shape[0],
+    ).to(device)
+
+    # load the mean pose as original
+    smpl_mean_file = config.SMPL_MEAN_FILE
+
+    file = h5py.File(smpl_mean_file, "r")
+    init_mean_pose = (
+        torch.from_numpy(file["pose"][:])
+        .unsqueeze(0).repeat(joints.shape[0], 1)
+        .float()
+        .to(device)
+    )
+    init_mean_shape = (
+        torch.from_numpy(file["shape"][:])
+        .unsqueeze(0).repeat(joints.shape[0], 1)
+        .float()
+        .to(device)
+    )
+    cam_trans_zero = torch.Tensor([0.0, 0.0, 0.0]).unsqueeze(0).to(device)
+
+    # initialize SMPLify
+    smplify = SMPLify3D(
+        smplxmodel=smplxmodel,
+        batch_size=joints.shape[0],
+        joints_category=opt.joint_category,
+        num_iters=opt.num_smplify_iters,
+        device=device,
+    )
+    print("initialize SMPLify3D done!")
+
+    print("Start SMPLify!")
+    keypoints_3d = torch.Tensor(joints).to(device).float()
+
+    if opt.joint_category == "AMASS":
+        confidence_input = torch.ones(opt.num_joints)
+        # make sure the foot and ankle
+        if opt.fix_foot:
+            confidence_input[7] = 1.5
+            confidence_input[8] = 1.5
+            confidence_input[10] = 1.5
+            confidence_input[11] = 1.5
+    else:
+        print("Such category not settle down!")
+
+    # ----- from initial to fitting -------
+    (
+        new_opt_vertices,
+        new_opt_joints,
+        new_opt_pose,
+        new_opt_betas,
+        new_opt_cam_t,
+        new_opt_joint_loss,
+    ) = smplify(
+        init_mean_pose.detach(),
+        init_mean_shape.detach(),
+        cam_trans_zero.detach(),
+        keypoints_3d,
+        conf_3d=confidence_input.to(device)
+    )
+
+    # fix shape
+    betas = torch.zeros_like(new_opt_betas)
+    root = keypoints_3d[:, 0, :]
+
+    output = smplxmodel(
+        betas=betas,
+        global_orient=new_opt_pose[:, :3],
+        body_pose=new_opt_pose[:, 3:],
+        transl=root,
+        return_verts=True,
+    )
+    vertices = output.vertices.detach().cpu().numpy()
+    data['vertices'] = vertices
+
+    save_file = path.replace('.pkl', '_mesh.pkl')
+    with open(save_file, 'wb') as f:
+        pickle.dump(data, f)
+    print(f'vertices saved in {save_file}')
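A short sketch of consuming fit.py's output, with a hypothetical file name: each *_mesh.pkl keeps the fields written by demo.py and adds a 'vertices' array holding one fitted SMPL mesh per frame (6,890 vertices per frame for the standard SMPL body, as an assumption about the body model used above).

import pickle

with open("sample_mesh.pkl", "rb") as f:   # hypothetical file produced by fit.py
    data = pickle.load(f)

print(data["text"], data["length"])
print(data["vertices"].shape)              # expected (num_frames, 6890, 3) for SMPL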
mld/__init__.py
ADDED
File without changes
mld/config.py
ADDED
@@ -0,0 +1,47 @@
+import os
+import importlib
+from typing import Type, TypeVar
+from argparse import ArgumentParser
+
+from omegaconf import OmegaConf, DictConfig
+
+
+def get_module_config(cfg_model: DictConfig, path: str = "modules") -> DictConfig:
+    files = os.listdir(f'./configs/{path}/')
+    for file in files:
+        if file.endswith('.yaml'):
+            with open(f'./configs/{path}/' + file, 'r') as f:
+                cfg_model.merge_with(OmegaConf.load(f))
+    return cfg_model
+
+
+def get_obj_from_str(string: str, reload: bool = False) -> Type:
+    module, cls = string.rsplit(".", 1)
+    if reload:
+        module_imp = importlib.import_module(module)
+        importlib.reload(module_imp)
+    return getattr(importlib.import_module(module, package=None), cls)
+
+
+def instantiate_from_config(config: DictConfig) -> TypeVar:
+    return get_obj_from_str(config["target"])(**config.get("params", dict()))
+
+
+def parse_args() -> DictConfig:
+    parser = ArgumentParser()
+    parser.add_argument("--cfg", type=str, required=True, help="config file")
+
+    # Demo Args
+    parser.add_argument('--example', type=str, required=False, help="input text and lengths with txt format")
+    parser.add_argument('--no-plot', action="store_true", required=False, help="whether plot the skeleton-based motion")
+    parser.add_argument('--replication', type=int, default=1, help="the number of replication of sampling")
+    args = parser.parse_args()
+
+    cfg = OmegaConf.load(args.cfg)
+    cfg_model = get_module_config(cfg.model, cfg.model.target)
+    cfg = OmegaConf.merge(cfg, cfg_model)
+
+    cfg.example = args.example
+    cfg.no_plot = args.no_plot
+    cfg.replication = args.replication
+    return cfg
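A minimal sketch of how the pieces above fit together (run from the repository root): parse_args loads the top-level YAML, get_module_config merges every YAML under configs/<model.target>/ into cfg.model, and instantiate_from_config then imports the class named by 'target' and calls it with 'params' as keyword arguments. This mirrors what app.py does before building the model.

from omegaconf import OmegaConf
from mld.config import get_module_config

cfg = OmegaConf.load("configs/motionlcm_t2m.yaml")
cfg_model = get_module_config(cfg.model, cfg.model.target)  # merges configs/modules/*.yaml
cfg = OmegaConf.merge(cfg, cfg_model)

print(cfg.model.denoiser.target)                    # mld.models.architectures.mld_denoiser.MldDenoiser
print(cfg.model.scheduler.num_inference_timesteps)  # 1: the LCM scheduler runs a single step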
mld/data/HumanML3D.py
ADDED
@@ -0,0 +1,79 @@
+import copy
+from typing import Callable, Optional
+
+import numpy as np
+from omegaconf import DictConfig
+
+import torch
+
+from .base import BASEDataModule
+from .humanml.dataset import Text2MotionDatasetV2
+from .humanml.scripts.motion_process import recover_from_ric
+
+
+class HumanML3DDataModule(BASEDataModule):
+
+    def __init__(self,
+                 cfg: DictConfig,
+                 batch_size: int,
+                 num_workers: int,
+                 collate_fn: Optional[Callable] = None,
+                 persistent_workers: bool = True,
+                 phase: str = "train",
+                 **kwargs) -> None:
+        super().__init__(batch_size=batch_size,
+                         num_workers=num_workers,
+                         collate_fn=collate_fn,
+                         persistent_workers=persistent_workers)
+        self.hparams = copy.deepcopy(kwargs)
+        self.name = "humanml3d"
+        self.njoints = 22
+        if phase == "text_only":
+            raise NotImplementedError
+        else:
+            self.Dataset = Text2MotionDatasetV2
+        self.cfg = cfg
+
+        sample_overrides = {"tiny": True, "progress_bar": False}
+        self._sample_set = self.get_sample_set(overrides=sample_overrides)
+        self.nfeats = self._sample_set.nfeats
+
+    def denorm_spatial(self, hint: torch.Tensor) -> torch.Tensor:
+        raw_mean = torch.tensor(self._sample_set.raw_mean).to(hint)
+        raw_std = torch.tensor(self._sample_set.raw_std).to(hint)
+        hint = hint * raw_std + raw_mean
+        return hint
+
+    def norm_spatial(self, hint: torch.Tensor) -> torch.Tensor:
+        raw_mean = torch.tensor(self._sample_set.raw_mean).to(hint)
+        raw_std = torch.tensor(self._sample_set.raw_std).to(hint)
+        hint = (hint - raw_mean) / raw_std
+        return hint
+
+    def feats2joints(self, features: torch.Tensor) -> torch.Tensor:
+        mean = torch.tensor(self.hparams['mean']).to(features)
+        std = torch.tensor(self.hparams['std']).to(features)
+        features = features * std + mean
+        return recover_from_ric(features, self.njoints)
+
+    def renorm4t2m(self, features: torch.Tensor) -> torch.Tensor:
+        # renorm to t2m norms for using t2m evaluators
+        ori_mean = torch.tensor(self.hparams['mean']).to(features)
+        ori_std = torch.tensor(self.hparams['std']).to(features)
+        eval_mean = torch.tensor(self.hparams['mean_eval']).to(features)
+        eval_std = torch.tensor(self.hparams['std_eval']).to(features)
+        features = features * ori_std + ori_mean
+        features = (features - eval_mean) / eval_std
+        return features
+
+    def mm_mode(self, mm_on: bool = True) -> None:
+        if mm_on:
+            self.is_mm = True
+            self.name_list = self.test_dataset.name_list
+            self.mm_list = np.random.choice(self.name_list,
+                                            self.cfg.TEST.MM_NUM_SAMPLES,
+                                            replace=False)
+            self.test_dataset.name_list = self.mm_list
+        else:
+            self.is_mm = False
+            self.test_dataset.name_list = self.name_list
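A small usage sketch, assuming dm is an already constructed HumanML3DDataModule, features is a normalized HumanML3D feature batch of shape (batch, frames, nfeats), and hint is a normalized spatial control hint: feats2joints de-normalizes with the stored mean/std and recovers 3D joint positions via recover_from_ric, while norm_spatial and denorm_spatial are exact inverses used for control hints.

joints = dm.feats2joints(features)                    # -> (batch, frames, 22, 3) xyz joints
hint_back = dm.denorm_spatial(dm.norm_spatial(hint))  # round-trips a control hint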
mld/data/Kit.py
ADDED
@@ -0,0 +1,79 @@
+import copy
+from typing import Callable, Optional
+
+import numpy as np
+from omegaconf import DictConfig
+
+import torch
+
+from .base import BASEDataModule
+from .humanml.dataset import Text2MotionDatasetV2
+from .humanml.scripts.motion_process import recover_from_ric
+
+
+class KitDataModule(BASEDataModule):
+
+    def __init__(self,
+                 cfg: DictConfig,
+                 batch_size: int,
+                 num_workers: int,
+                 collate_fn: Optional[Callable] = None,
+                 persistent_workers: bool = True,
+                 phase: str = "train",
+                 **kwargs) -> None:
+        super().__init__(batch_size=batch_size,
+                         num_workers=num_workers,
+                         collate_fn=collate_fn,
+                         persistent_workers=persistent_workers)
+        self.hparams = copy.deepcopy(kwargs)
+        self.name = 'kit'
+        self.njoints = 21
+        if phase == 'text_only':
+            raise NotImplementedError
+        else:
+            self.Dataset = Text2MotionDatasetV2
+        self.cfg = cfg
+
+        sample_overrides = {"tiny": True, "progress_bar": False}
+        self._sample_set = self.get_sample_set(overrides=sample_overrides)
+        self.nfeats = self._sample_set.nfeats
+
+    def denorm_spatial(self, hint: torch.Tensor) -> torch.Tensor:
+        raw_mean = torch.tensor(self._sample_set.raw_mean).to(hint)
+        raw_std = torch.tensor(self._sample_set.raw_std).to(hint)
+        hint = hint * raw_std + raw_mean
+        return hint
+
+    def norm_spatial(self, hint: torch.Tensor) -> torch.Tensor:
+        raw_mean = torch.tensor(self._sample_set.raw_mean).to(hint)
+        raw_std = torch.tensor(self._sample_set.raw_std).to(hint)
+        hint = (hint - raw_mean) / raw_std
+        return hint
+
+    def feats2joints(self, features: torch.Tensor) -> torch.Tensor:
+        mean = torch.tensor(self.hparams['mean']).to(features)
+        std = torch.tensor(self.hparams['std']).to(features)
+        features = features * std + mean
+        return recover_from_ric(features, self.njoints)
+
+    def renorm4t2m(self, features: torch.Tensor) -> torch.Tensor:
+        # renorm to t2m norms for using t2m evaluators
+        ori_mean = torch.tensor(self.hparams['mean']).to(features)
+        ori_std = torch.tensor(self.hparams['std']).to(features)
+        eval_mean = torch.tensor(self.hparams['mean_eval']).to(features)
+        eval_std = torch.tensor(self.hparams['std_eval']).to(features)
+        features = features * ori_std + ori_mean
+        features = (features - eval_mean) / eval_std
+        return features
+
+    def mm_mode(self, mm_on: bool = True) -> None:
+        if mm_on:
+            self.is_mm = True
+            self.name_list = self.test_dataset.name_list
+            self.mm_list = np.random.choice(self.name_list,
+                                            self.cfg.TEST.MM_NUM_SAMPLES,
+                                            replace=False)
+            self.test_dataset.name_list = self.mm_list
+        else:
+            self.is_mm = False
+            self.test_dataset.name_list = self.name_list
mld/data/__init__.py
ADDED
File without changes
mld/data/base.py
ADDED
@@ -0,0 +1,65 @@
+import copy
+from os.path import join as pjoin
+from typing import Any, Callable
+
+from torch.utils.data import DataLoader
+
+from .humanml.dataset import Text2MotionDatasetV2
+
+
+class BASEDataModule:
+    def __init__(self, collate_fn: Callable, batch_size: int,
+                 num_workers: int, persistent_workers: bool) -> None:
+        super(BASEDataModule, self).__init__()
+        self.dataloader_options = {
+            "batch_size": batch_size,
+            "num_workers": num_workers,
+            "collate_fn": collate_fn,
+            "persistent_workers": persistent_workers
+        }
+        self.is_mm = False
+
+    def get_sample_set(self, overrides: dict) -> Text2MotionDatasetV2:
+        sample_params = copy.deepcopy(self.hparams)
+        sample_params.update(overrides)
+        split_file = pjoin(
+            eval(f"self.cfg.DATASET.{self.name.upper()}.SPLIT_ROOT"),
+            self.cfg.EVAL.SPLIT + ".txt",
+        )
+        return self.Dataset(split_file=split_file, **sample_params)
+
+    def __getattr__(self, item: str) -> Any:
+        if item.endswith("_dataset") and not item.startswith("_"):
+            subset = item[:-len("_dataset")]
+            item_c = "_" + item
+            if item_c not in self.__dict__:
+
+                subset = subset.upper() if subset != "val" else "EVAL"
+                split = eval(f"self.cfg.{subset}.SPLIT")
+                split_file = pjoin(
+                    eval(f"self.cfg.DATASET.{self.name.upper()}.SPLIT_ROOT"),
+                    eval(f"self.cfg.{subset}.SPLIT") + ".txt",
+                )
+                self.__dict__[item_c] = self.Dataset(split_file=split_file,
+                                                     split=split,
+                                                     **self.hparams)
+            return getattr(self, item_c)
+        classname = self.__class__.__name__
+        raise AttributeError(f"'{classname}' object has no attribute '{item}'")
+
+    def train_dataloader(self) -> DataLoader:
+        return DataLoader(self.train_dataset, shuffle=True, **self.dataloader_options)
+
+    def val_dataloader(self) -> DataLoader:
+        dataloader_options = self.dataloader_options.copy()
+        dataloader_options["batch_size"] = self.cfg.EVAL.BATCH_SIZE
+        dataloader_options["num_workers"] = self.cfg.EVAL.NUM_WORKERS
+        dataloader_options["shuffle"] = False
+        return DataLoader(self.val_dataset, **dataloader_options)
+
+    def test_dataloader(self) -> DataLoader:
+        dataloader_options = self.dataloader_options.copy()
+        dataloader_options["batch_size"] = 1 if self.is_mm else self.cfg.TEST.BATCH_SIZE
+        dataloader_options["num_workers"] = self.cfg.TEST.NUM_WORKERS
+        dataloader_options["shuffle"] = False
+        return DataLoader(self.test_dataset, **dataloader_options)
mld/data/get_data.py
ADDED
@@ -0,0 +1,93 @@
from os.path import join as pjoin
from typing import Callable, Optional

import numpy as np

from omegaconf import DictConfig

from .humanml.utils.word_vectorizer import WordVectorizer
from .HumanML3D import HumanML3DDataModule
from .Kit import KitDataModule
from .base import BASEDataModule
from .utils import mld_collate


def get_mean_std(phase: str, cfg: DictConfig, dataset_name: str) -> tuple[np.ndarray, np.ndarray]:
    name = "t2m" if dataset_name == "humanml3d" else dataset_name
    assert name in ["t2m", "kit"]
    if phase in ["val"]:
        if name == 't2m':
            data_root = pjoin(cfg.model.t2m_path, name, "Comp_v6_KLD01", "meta")
        elif name == 'kit':
            data_root = pjoin(cfg.model.t2m_path, name, "Comp_v6_KLD005", "meta")
        else:
            raise ValueError("Only support t2m and kit")
        mean = np.load(pjoin(data_root, "mean.npy"))
        std = np.load(pjoin(data_root, "std.npy"))
    else:
        data_root = eval(f"cfg.DATASET.{dataset_name.upper()}.ROOT")
        mean = np.load(pjoin(data_root, "Mean.npy"))
        std = np.load(pjoin(data_root, "Std.npy"))

    return mean, std


def get_WordVectorizer(cfg: DictConfig, phase: str, dataset_name: str) -> Optional[WordVectorizer]:
    if phase not in ["text_only"]:
        if dataset_name.lower() in ["humanml3d", "kit"]:
            return WordVectorizer(cfg.DATASET.WORD_VERTILIZER_PATH, "our_vab")
        else:
            raise ValueError("Only support WordVectorizer for HumanML3D")
    else:
        return None


def get_collate_fn(name: str) -> Callable:
    if name.lower() in ["humanml3d", "kit"]:
        return mld_collate
    else:
        raise NotImplementedError


dataset_module_map = {"humanml3d": HumanML3DDataModule, "kit": KitDataModule}
motion_subdir = {"humanml3d": "new_joint_vecs", "kit": "new_joint_vecs"}


def get_datasets(cfg: DictConfig, phase: str = "train") -> list[BASEDataModule]:
    dataset_names = eval(f"cfg.{phase.upper()}.DATASETS")
    datasets = []
    for dataset_name in dataset_names:
        if dataset_name.lower() in ["humanml3d", "kit"]:
            data_root = eval(f"cfg.DATASET.{dataset_name.upper()}.ROOT")
            mean, std = get_mean_std(phase, cfg, dataset_name)
            mean_eval, std_eval = get_mean_std("val", cfg, dataset_name)
            wordVectorizer = get_WordVectorizer(cfg, phase, dataset_name)
            collate_fn = get_collate_fn(dataset_name)
            dataset = dataset_module_map[dataset_name.lower()](
                cfg=cfg,
                batch_size=cfg.TRAIN.BATCH_SIZE,
                num_workers=cfg.TRAIN.NUM_WORKERS,
                collate_fn=collate_fn,
                persistent_workers=cfg.TRAIN.PERSISTENT_WORKERS,
                mean=mean,
                std=std,
                mean_eval=mean_eval,
                std_eval=std_eval,
                w_vectorizer=wordVectorizer,
                text_dir=pjoin(data_root, "texts"),
                motion_dir=pjoin(data_root, motion_subdir[dataset_name]),
                max_motion_length=cfg.DATASET.SAMPLER.MAX_LEN,
                min_motion_length=cfg.DATASET.SAMPLER.MIN_LEN,
                max_text_len=cfg.DATASET.SAMPLER.MAX_TEXT_LEN,
                unit_length=eval(f"cfg.DATASET.{dataset_name.upper()}.UNIT_LEN"),
                model_kwargs=cfg.model
            )
            datasets.append(dataset)

        elif dataset_name.lower() in ["humanact12", 'uestc', "amass"]:
            raise NotImplementedError

    cfg.DATASET.NFEATS = datasets[0].nfeats
    cfg.DATASET.NJOINTS = datasets[0].njoints
    return datasets
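
A quick usage sketch, not part of this commit: the config loader used here (the repo's mld/config.py helper) and the presence of the prepared HumanML3D data and evaluator checkpoints on disk are assumptions for illustration.

# Hypothetical usage sketch: build the test datamodule from a config and pull one batch.
from mld.config import parse_args          # assumed helper from mld/config.py
from mld.data.get_data import get_datasets

cfg = parse_args()                          # e.g. python script.py --cfg configs/mld_t2m_infer.yaml
dataset = get_datasets(cfg, phase="test")[0]
test_loader = dataset.test_dataloader()     # batch size comes from cfg.TEST.BATCH_SIZE
batch = next(iter(test_loader))
print(batch["motion"].shape, batch["length"][:4])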
mld/data/humanml/__init__.py
ADDED
File without changes
mld/data/humanml/common/quaternion.py
ADDED
@@ -0,0 +1,29 @@
import torch


def qinv(q: torch.Tensor) -> torch.Tensor:
    assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
    mask = torch.ones_like(q)
    mask[..., 1:] = -mask[..., 1:]
    return q * mask


def qrot(q: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """
    Rotate vector(s) v about the rotation described by quaternion(s) q.
    Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
    where * denotes any number of dimensions.
    Returns a tensor of shape (*, 3).
    """
    assert q.shape[-1] == 4
    assert v.shape[-1] == 3
    assert q.shape[:-1] == v.shape[:-1]

    original_shape = list(v.shape)
    q = q.contiguous().view(-1, 4)
    v = v.contiguous().view(-1, 3)

    qvec = q[:, 1:]
    uv = torch.cross(qvec, v, dim=1)
    uuv = torch.cross(qvec, uv, dim=1)
    return (v + 2 * (q[:, :1] * uv + uuv)).view(original_shape)
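
A minimal sanity check (not in the commit) for the two helpers above: a 90-degree rotation about Y maps the x-axis to -z, and rotating by the inverse quaternion undoes it.

import math
import torch
from mld.data.humanml.common.quaternion import qinv, qrot

q = torch.tensor([[math.cos(math.pi / 4), 0.0, math.sin(math.pi / 4), 0.0]])  # (w, x, y, z): +90 deg about Y
v = torch.tensor([[1.0, 0.0, 0.0]])
rotated = qrot(q, v)               # approximately [0, 0, -1]
restored = qrot(qinv(q), rotated)  # back to approximately [1, 0, 0]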
mld/data/humanml/dataset.py
ADDED
@@ -0,0 +1,290 @@
import codecs as cs
import random
from os.path import join as pjoin

import numpy as np
from rich.progress import track

import torch
from torch.utils import data

from mld.data.humanml.scripts.motion_process import recover_from_ric
from .utils.word_vectorizer import WordVectorizer


class Text2MotionDatasetV2(data.Dataset):

    def __init__(
        self,
        mean: np.ndarray,
        std: np.ndarray,
        split_file: str,
        w_vectorizer: WordVectorizer,
        max_motion_length: int,
        min_motion_length: int,
        max_text_len: int,
        unit_length: int,
        motion_dir: str,
        text_dir: str,
        tiny: bool = False,
        progress_bar: bool = True,
        **kwargs,
    ) -> None:
        self.w_vectorizer = w_vectorizer
        self.max_motion_length = max_motion_length
        self.min_motion_length = min_motion_length
        self.max_text_len = max_text_len
        self.unit_length = unit_length

        data_dict = {}
        id_list = []
        with cs.open(split_file, "r") as f:
            for line in f.readlines():
                id_list.append(line.strip())
        self.id_list = id_list

        if tiny:
            progress_bar = False
            maxdata = 10
        else:
            maxdata = 1e10

        if progress_bar:
            enumerator = enumerate(
                track(
                    id_list,
                    f"Loading HumanML3D {split_file.split('/')[-1].split('.')[0]}",
                ))
        else:
            enumerator = enumerate(id_list)
        count = 0
        bad_count = 0
        new_name_list = []
        length_list = []
        for i, name in enumerator:
            if count > maxdata:
                break
            try:
                motion = np.load(pjoin(motion_dir, name + ".npy"))
                if (len(motion)) < self.min_motion_length or (len(motion) >= 200):
                    bad_count += 1
                    continue
                text_data = []
                flag = False
                with cs.open(pjoin(text_dir, name + ".txt")) as f:
                    for line in f.readlines():
                        text_dict = {}
                        line_split = line.strip().split("#")
                        caption = line_split[0]
                        tokens = line_split[1].split(" ")
                        f_tag = float(line_split[2])
                        to_tag = float(line_split[3])
                        f_tag = 0.0 if np.isnan(f_tag) else f_tag
                        to_tag = 0.0 if np.isnan(to_tag) else to_tag

                        text_dict["caption"] = caption
                        text_dict["tokens"] = tokens
                        if f_tag == 0.0 and to_tag == 0.0:
                            flag = True
                            text_data.append(text_dict)
                        else:
                            try:
                                n_motion = motion[int(f_tag * 20):int(to_tag * 20)]
                                if (len(n_motion)) < self.min_motion_length or (len(n_motion) >= 200):
                                    continue
                                new_name = (
                                    random.choice("ABCDEFGHIJKLMNOPQRSTUVW") +
                                    "_" + name)
                                while new_name in data_dict:
                                    new_name = (
                                        random.choice("ABCDEFGHIJKLMNOPQRSTUVW") +
                                        "_" + name)
                                data_dict[new_name] = {
                                    "motion": n_motion,
                                    "length": len(n_motion),
                                    "text": [text_dict],
                                }
                                new_name_list.append(new_name)
                                length_list.append(len(n_motion))
                            except:
                                print(line_split)
                                print(line_split[2], line_split[3], f_tag, to_tag, name)

                if flag:
                    data_dict[name] = {
                        "motion": motion,
                        "length": len(motion),
                        "text": text_data,
                    }
                    new_name_list.append(name)
                    length_list.append(len(motion))
                    count += 1
            except:
                pass

        name_list, length_list = zip(
            *sorted(zip(new_name_list, length_list), key=lambda x: x[1]))

        self.mean = mean
        self.std = std

        self.mode = None
        model_params = kwargs['model_kwargs']
        if 'is_controlnet' in model_params and model_params.is_controlnet is True:
            if 'test' in split_file or 'val' in split_file:
                self.mode = 'eval'
            else:
                self.mode = 'train'

            self.t_ctrl = model_params.is_controlnet_temporal
            spatial_norm_path = './datasets/humanml_spatial_norm'
            self.raw_mean = np.load(pjoin(spatial_norm_path, 'Mean_raw.npy'))
            self.raw_std = np.load(pjoin(spatial_norm_path, 'Std_raw.npy'))

            self.training_control_joint = np.array(model_params.training_control_joint)
            self.testing_control_joint = np.array(model_params.testing_control_joint)

            self.training_density = model_params.training_density
            self.testing_density = model_params.testing_density

        self.length_arr = np.array(length_list)
        self.data_dict = data_dict
        self.nfeats = motion.shape[1]
        self.name_list = name_list

    def __len__(self) -> int:
        return len(self.name_list)

    def random_mask(self, joints: np.ndarray, n_joints: int = 22) -> np.ndarray:
        choose_joint = self.testing_control_joint

        length = joints.shape[0]
        density = self.testing_density
        if density in [1, 2, 5]:
            choose_seq_num = density
        else:
            choose_seq_num = int(length * density / 100)

        if self.t_ctrl:
            choose_seq = np.arange(0, choose_seq_num)
        else:
            choose_seq = np.random.choice(length, choose_seq_num, replace=False)
            choose_seq.sort()

        mask_seq = np.zeros((length, n_joints, 3)).astype(bool)

        for cj in choose_joint:
            mask_seq[choose_seq, cj] = True

        # normalize
        joints = (joints - self.raw_mean.reshape(n_joints, 3)) / self.raw_std.reshape(n_joints, 3)
        joints = joints * mask_seq
        return joints

    def random_mask_train(self, joints: np.ndarray, n_joints: int = 22) -> np.ndarray:
        if self.t_ctrl:
            choose_joint = self.training_control_joint
        else:
            num_joints = len(self.training_control_joint)
            num_joints_control = 1
            choose_joint = np.random.choice(num_joints, num_joints_control, replace=False)
            choose_joint = self.training_control_joint[choose_joint]

        length = joints.shape[0]

        if self.training_density == 'random':
            choose_seq_num = np.random.choice(length - 1, 1) + 1
        else:
            choose_seq_num = int(length * random.uniform(self.training_density[0], self.training_density[1]) / 100)

        if self.t_ctrl:
            choose_seq = np.arange(0, choose_seq_num)
        else:
            choose_seq = np.random.choice(length, choose_seq_num, replace=False)
            choose_seq.sort()

        mask_seq = np.zeros((length, n_joints, 3)).astype(bool)

        for cj in choose_joint:
            mask_seq[choose_seq, cj] = True

        # normalize
        joints = (joints - self.raw_mean.reshape(n_joints, 3)) / self.raw_std.reshape(n_joints, 3)
        joints = joints * mask_seq
        return joints

    def __getitem__(self, idx: int) -> tuple:
        data = self.data_dict[self.name_list[idx]]
        motion, m_length, text_list = data["motion"], data["length"], data["text"]
        # Randomly select a caption
        text_data = random.choice(text_list)
        caption, tokens = text_data["caption"], text_data["tokens"]

        if len(tokens) < self.max_text_len:
            # pad with "unk"
            tokens = ["sos/OTHER"] + tokens + ["eos/OTHER"]
            sent_len = len(tokens)
            tokens = tokens + ["unk/OTHER"] * (self.max_text_len + 2 - sent_len)
        else:
            # crop
            tokens = tokens[:self.max_text_len]
            tokens = ["sos/OTHER"] + tokens + ["eos/OTHER"]
            sent_len = len(tokens)
        pos_one_hots = []
        word_embeddings = []
        for token in tokens:
            word_emb, pos_oh = self.w_vectorizer[token]
            pos_one_hots.append(pos_oh[None, :])
            word_embeddings.append(word_emb[None, :])
        pos_one_hots = np.concatenate(pos_one_hots, axis=0)
        word_embeddings = np.concatenate(word_embeddings, axis=0)

        # Crop the motions in to times of 4, and introduce small variations
        if self.unit_length < 10:
            coin2 = np.random.choice(["single", "single", "double"])
        else:
            coin2 = "single"

        if coin2 == "double":
            m_length = (m_length // self.unit_length - 1) * self.unit_length
        elif coin2 == "single":
            m_length = (m_length // self.unit_length) * self.unit_length
        idx = random.randint(0, len(motion) - m_length)
        motion = motion[idx:idx + m_length]

        hint = None
        if self.mode is not None:
            n_joints = 22 if motion.shape[-1] == 263 else 21
            # hint is global position of the controllable joints
            joints = recover_from_ric(torch.from_numpy(motion).float(), n_joints)
            joints = joints.numpy()

            # control any joints at any time
            if self.mode == 'train':
                hint = self.random_mask_train(joints, n_joints)
            else:
                hint = self.random_mask(joints, n_joints)

            hint = hint.reshape(hint.shape[0], -1)

        "Z Normalization"
        motion = (motion - self.mean) / self.std

        # debug check nan
        if np.any(np.isnan(motion)):
            raise ValueError("nan in motion")

        return (
            word_embeddings,
            pos_one_hots,
            caption,
            sent_len,
            motion,
            m_length,
            "_".join(tokens),
            hint
        )
mld/data/humanml/scripts/motion_process.py
ADDED
@@ -0,0 +1,51 @@
import torch

from ..common.quaternion import qinv, qrot


# Recover global angle and positions for rotation dataset
# root_rot_velocity (B, seq_len, 1)
# root_linear_velocity (B, seq_len, 2)
# root_y (B, seq_len, 1)
# ric_data (B, seq_len, (joint_num - 1)*3)
# rot_data (B, seq_len, (joint_num - 1)*6)
# local_velocity (B, seq_len, joint_num*3)
# foot contact (B, seq_len, 4)
def recover_root_rot_pos(data: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    rot_vel = data[..., 0]
    r_rot_ang = torch.zeros_like(rot_vel).to(data.device)
    '''Get Y-axis rotation from rotation velocity'''
    r_rot_ang[..., 1:] = rot_vel[..., :-1]
    r_rot_ang = torch.cumsum(r_rot_ang, dim=-1)

    r_rot_quat = torch.zeros(data.shape[:-1] + (4,)).to(data.device)
    r_rot_quat[..., 0] = torch.cos(r_rot_ang)
    r_rot_quat[..., 2] = torch.sin(r_rot_ang)

    r_pos = torch.zeros(data.shape[:-1] + (3,)).to(data.device)
    r_pos[..., 1:, [0, 2]] = data[..., :-1, 1:3]
    '''Add Y-axis rotation to root position'''
    r_pos = qrot(qinv(r_rot_quat), r_pos)

    r_pos = torch.cumsum(r_pos, dim=-2)

    r_pos[..., 1] = data[..., 3]
    return r_rot_quat, r_pos


def recover_from_ric(data: torch.Tensor, joints_num: int) -> torch.Tensor:
    r_rot_quat, r_pos = recover_root_rot_pos(data)
    positions = data[..., 4:(joints_num - 1) * 3 + 4]
    positions = positions.view(positions.shape[:-1] + (-1, 3))

    '''Add Y-axis rotation to local joints'''
    positions = qrot(qinv(r_rot_quat[..., None, :]).expand(positions.shape[:-1] + (4,)), positions)

    '''Add root XZ to joints'''
    positions[..., 0] += r_pos[..., 0:1]
    positions[..., 2] += r_pos[..., 2:3]

    '''Concat root and joints'''
    positions = torch.cat([r_pos.unsqueeze(-2), positions], dim=-2)

    return positions
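
A shape-only sketch (not in the commit): HumanML3D motion features are 263-dimensional per frame and decode to 22 joints; the tensor here is random, so only the shapes are meaningful.

import torch
from mld.data.humanml.scripts.motion_process import recover_from_ric

feats = torch.randn(1, 60, 263)                # (batch, frames, feature_dim)
joints = recover_from_ric(feats, joints_num=22)
print(joints.shape)                            # torch.Size([1, 60, 22, 3])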
mld/data/humanml/utils/__init__.py
ADDED
File without changes
mld/data/humanml/utils/paramUtil.py
ADDED
@@ -0,0 +1,62 @@
import numpy as np

# Define a kinematic tree for the skeletal structure
kit_kinematic_chain = [[0, 11, 12, 13, 14, 15], [0, 16, 17, 18, 19, 20], [0, 1, 2, 3, 4], [3, 5, 6, 7], [3, 8, 9, 10]]

kit_raw_offsets = np.array(
    [
        [0, 0, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, -1, 0],
        [0, -1, 0],
        [-1, 0, 0],
        [0, -1, 0],
        [0, -1, 0],
        [1, 0, 0],
        [0, -1, 0],
        [0, -1, 0],
        [0, 0, 1],
        [0, 0, 1],
        [-1, 0, 0],
        [0, -1, 0],
        [0, -1, 0],
        [0, 0, 1],
        [0, 0, 1]
    ]
)

t2m_raw_offsets = np.array([[0, 0, 0],
                            [1, 0, 0],
                            [-1, 0, 0],
                            [0, 1, 0],
                            [0, -1, 0],
                            [0, -1, 0],
                            [0, 1, 0],
                            [0, -1, 0],
                            [0, -1, 0],
                            [0, 1, 0],
                            [0, 0, 1],
                            [0, 0, 1],
                            [0, 1, 0],
                            [1, 0, 0],
                            [-1, 0, 0],
                            [0, 0, 1],
                            [0, -1, 0],
                            [0, -1, 0],
                            [0, -1, 0],
                            [0, -1, 0],
                            [0, -1, 0],
                            [0, -1, 0]])

t2m_kinematic_chain = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10], [0, 3, 6, 9, 12, 15], [9, 14, 17, 19, 21],
                       [9, 13, 16, 18, 20]]
t2m_left_hand_chain = [[20, 22, 23, 24], [20, 34, 35, 36], [20, 25, 26, 27], [20, 31, 32, 33], [20, 28, 29, 30]]
t2m_right_hand_chain = [[21, 43, 44, 45], [21, 46, 47, 48], [21, 40, 41, 42], [21, 37, 38, 39], [21, 49, 50, 51]]

kit_tgt_skel_id = '03950'

t2m_tgt_skel_id = '000021'
mld/data/humanml/utils/plot_script.py
ADDED
@@ -0,0 +1,98 @@
from textwrap import wrap
from typing import Optional

import numpy as np

import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from matplotlib.animation import FuncAnimation
from mpl_toolkits.mplot3d.art3d import Poly3DCollection

import mld.data.humanml.utils.paramUtil as paramUtil

skeleton = paramUtil.t2m_kinematic_chain


def plot_3d_motion(save_path: str, joints: np.ndarray, title: str,
                   figsize: tuple[int, int] = (3, 3),
                   fps: int = 120, radius: int = 3, kinematic_tree: list = skeleton,
                   hint: Optional[np.ndarray] = None) -> None:

    title = '\n'.join(wrap(title, 20))

    def init():
        ax.set_xlim3d([-radius / 2, radius / 2])
        ax.set_ylim3d([0, radius])
        ax.set_zlim3d([-radius / 3., radius * 2 / 3.])
        fig.suptitle(title, fontsize=10)
        ax.grid(b=False)

    def plot_xzPlane(minx, maxx, miny, minz, maxz):
        # Plot a plane XZ
        verts = [
            [minx, miny, minz],
            [minx, miny, maxz],
            [maxx, miny, maxz],
            [maxx, miny, minz]
        ]
        xz_plane = Poly3DCollection([verts])
        xz_plane.set_facecolor((0.5, 0.5, 0.5, 0.5))
        ax.add_collection3d(xz_plane)

    # (seq_len, joints_num, 3)
    data = joints.copy().reshape(len(joints), -1, 3)

    data *= 1.3  # scale for visualization
    if hint is not None:
        mask = hint.sum(-1) != 0
        hint = hint[mask]
        hint *= 1.3

    fig = plt.figure(figsize=figsize)
    plt.tight_layout()
    ax = p3.Axes3D(fig)
    init()
    MINS = data.min(axis=0).min(axis=0)
    MAXS = data.max(axis=0).max(axis=0)
    colors = ["#DD5A37", "#D69E00", "#B75A39", "#DD5A37", "#D69E00",
              "#FF6D00", "#FF6D00", "#FF6D00", "#FF6D00", "#FF6D00",
              "#DDB50E", "#DDB50E", "#DDB50E", "#DDB50E", "#DDB50E", ]

    frame_number = data.shape[0]

    height_offset = MINS[1]
    data[:, :, 1] -= height_offset
    if hint is not None:
        hint[..., 1] -= height_offset
    trajec = data[:, 0, [0, 2]]

    data[..., 0] -= data[:, 0:1, 0]
    data[..., 2] -= data[:, 0:1, 2]

    def update(index):
        ax.lines = []
        ax.collections = []
        ax.view_init(elev=120, azim=-90)
        ax.dist = 7.5
        plot_xzPlane(MINS[0] - trajec[index, 0], MAXS[0] - trajec[index, 0], 0, MINS[2] - trajec[index, 1],
                     MAXS[2] - trajec[index, 1])

        if hint is not None:
            ax.scatter(hint[..., 0] - trajec[index, 0], hint[..., 1], hint[..., 2] - trajec[index, 1], color="#80B79A")

        for i, (chain, color) in enumerate(zip(kinematic_tree, colors)):
            if i < 5:
                linewidth = 4.0
            else:
                linewidth = 2.0
            ax.plot3D(data[index, chain, 0], data[index, chain, 1], data[index, chain, 2], linewidth=linewidth,
                      color=color)

        plt.axis('off')
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_zticklabels([])

    ani = FuncAnimation(fig, update, frames=frame_number, interval=1000 / fps, repeat=False)
    ani.save(save_path, fps=fps)
    plt.close()
mld/data/humanml/utils/word_vectorizer.py
ADDED
@@ -0,0 +1,82 @@
import pickle
from os.path import join as pjoin

import numpy as np


POS_enumerator = {
    'VERB': 0,
    'NOUN': 1,
    'DET': 2,
    'ADP': 3,
    'NUM': 4,
    'AUX': 5,
    'PRON': 6,
    'ADJ': 7,
    'ADV': 8,
    'Loc_VIP': 9,
    'Body_VIP': 10,
    'Obj_VIP': 11,
    'Act_VIP': 12,
    'Desc_VIP': 13,
    'OTHER': 14,
}

Loc_list = ('left', 'right', 'clockwise', 'counterclockwise', 'anticlockwise', 'forward', 'back', 'backward',
            'up', 'down', 'straight', 'curve')

Body_list = ('arm', 'chin', 'foot', 'feet', 'face', 'hand', 'mouth', 'leg', 'waist', 'eye', 'knee', 'shoulder', 'thigh')

Obj_List = ('stair', 'dumbbell', 'chair', 'window', 'floor', 'car', 'ball', 'handrail', 'baseball', 'basketball')

Act_list = ('walk', 'run', 'swing', 'pick', 'bring', 'kick', 'put', 'squat', 'throw', 'hop', 'dance', 'jump', 'turn',
            'stumble', 'dance', 'stop', 'sit', 'lift', 'lower', 'raise', 'wash', 'stand', 'kneel', 'stroll',
            'rub', 'bend', 'balance', 'flap', 'jog', 'shuffle', 'lean', 'rotate', 'spin', 'spread', 'climb')

Desc_list = ('slowly', 'carefully', 'fast', 'careful', 'slow', 'quickly', 'happy', 'angry', 'sad', 'happily',
             'angrily', 'sadly')

VIP_dict = {
    'Loc_VIP': Loc_list,
    'Body_VIP': Body_list,
    'Obj_VIP': Obj_List,
    'Act_VIP': Act_list,
    'Desc_VIP': Desc_list,
}


class WordVectorizer(object):
    def __init__(self, meta_root: str, prefix: str) -> None:
        vectors = np.load(pjoin(meta_root, '%s_data.npy' % prefix))
        words = pickle.load(open(pjoin(meta_root, '%s_words.pkl' % prefix), 'rb'))
        word2idx = pickle.load(open(pjoin(meta_root, '%s_idx.pkl' % prefix), 'rb'))
        self.word2vec = {w: vectors[word2idx[w]] for w in words}

    def _get_pos_ohot(self, pos: str) -> np.ndarray:
        pos_vec = np.zeros(len(POS_enumerator))
        if pos in POS_enumerator:
            pos_vec[POS_enumerator[pos]] = 1
        else:
            pos_vec[POS_enumerator['OTHER']] = 1
        return pos_vec

    def __len__(self) -> int:
        return len(self.word2vec)

    def __getitem__(self, item: str) -> tuple:
        word, pos = item.split('/')
        if word in self.word2vec:
            word_vec = self.word2vec[word]
            vip_pos = None
            for key, values in VIP_dict.items():
                if word in values:
                    vip_pos = key
                    break
            if vip_pos is not None:
                pos_vec = self._get_pos_ohot(vip_pos)
            else:
                pos_vec = self._get_pos_ohot(pos)
        else:
            word_vec = self.word2vec['unk']
            pos_vec = self._get_pos_ohot('OTHER')
        return word_vec, pos_vec
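
A usage sketch (not in the commit); the GloVe metadata path below is an assumption and must point at the downloaded our_vab_*.npy/.pkl files.

from mld.data.humanml.utils.word_vectorizer import WordVectorizer

w_vectorizer = WordVectorizer('./deps/glove', 'our_vab')   # hypothetical local path
word_emb, pos_oh = w_vectorizer['walk/VERB']               # 'walk' is in Act_list, so the one-hot marks Act_VIP
print(word_emb.shape, pos_oh.argmax())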
mld/data/utils.py
ADDED
@@ -0,0 +1,38 @@
import torch


def collate_tensors(batch: list) -> torch.Tensor:
    dims = batch[0].dim()
    max_size = [max([b.size(i) for b in batch]) for i in range(dims)]
    size = (len(batch), ) + tuple(max_size)
    canvas = batch[0].new_zeros(size=size)
    for i, b in enumerate(batch):
        sub_tensor = canvas[i]
        for d in range(dims):
            sub_tensor = sub_tensor.narrow(d, 0, b.size(d))
        sub_tensor.add_(b)
    return canvas


def mld_collate(batch: list) -> dict:
    notnone_batches = [b for b in batch if b is not None]
    notnone_batches.sort(key=lambda x: x[3], reverse=True)
    adapted_batch = {
        "motion":
        collate_tensors([torch.tensor(b[4]).float() for b in notnone_batches]),
        "text": [b[2] for b in notnone_batches],
        "length": [b[5] for b in notnone_batches],
        "word_embs":
        collate_tensors([torch.tensor(b[0]).float() for b in notnone_batches]),
        "pos_ohot":
        collate_tensors([torch.tensor(b[1]).float() for b in notnone_batches]),
        "text_len":
        collate_tensors([torch.tensor(b[3]) for b in notnone_batches]),
        "tokens": [b[6] for b in notnone_batches],
    }

    # collate trajectory
    if notnone_batches[0][-1] is not None:
        adapted_batch['hint'] = collate_tensors([torch.tensor(b[-1]).float() for b in notnone_batches])

    return adapted_batch
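
A minimal sketch (not in the commit) of the zero-padding behaviour of collate_tensors: shorter items are padded with zeros up to the longest item in the batch.

import torch
from mld.data.utils import collate_tensors

a = torch.ones(4, 263)
b = torch.ones(7, 263)
padded = collate_tensors([a, b])
print(padded.shape)        # torch.Size([2, 7, 263]); padded[0, 4:] is all zeros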
mld/launch/__init__.py
ADDED
File without changes
mld/launch/blender.py
ADDED
@@ -0,0 +1,23 @@
# Fix blender path
import os
import sys
from argparse import ArgumentParser

sys.path.append(os.path.expanduser("~/.local/lib/python3.9/site-packages"))


# Monkey patch argparse such that
# blender / python parsing works
def parse_args(self, args=None, namespace=None):
    if args is not None:
        return self.parse_args_bak(args=args, namespace=namespace)
    try:
        idx = sys.argv.index("--")
        args = sys.argv[idx + 1:]  # the list after '--'
    except ValueError as e:  # '--' not in the list:
        args = []
    return self.parse_args_bak(args=args, namespace=namespace)


setattr(ArgumentParser, 'parse_args_bak', ArgumentParser.parse_args)
setattr(ArgumentParser, 'parse_args', parse_args)
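
A hypothetical invocation (not in the commit) showing why the patch exists: Blender keeps everything before '--' for itself, and after importing this module an ordinary ArgumentParser inside the script only sees the arguments that follow '--'.

# blender --background --python render_script.py -- --npy results/sample.npy   (hypothetical command)
from argparse import ArgumentParser
import mld.launch.blender  # noqa: F401  (importing applies the monkey patch)

parser = ArgumentParser()
parser.add_argument("--npy", type=str)
opt = parser.parse_args()  # parses only the tokens after '--', or [] if '--' is absent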
mld/models/__init__.py
ADDED
File without changes
mld/models/architectures/__init__.py
ADDED
File without changes
mld/models/architectures/mld_clip.py
ADDED
@@ -0,0 +1,72 @@
import torch
import torch.nn as nn

from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer


class MldTextEncoder(nn.Module):

    def __init__(self, modelpath: str, last_hidden_state: bool = False) -> None:
        super().__init__()

        if 't5' in modelpath:
            self.text_model = SentenceTransformer(modelpath)
            self.tokenizer = self.text_model.tokenizer
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(modelpath)
            self.text_model = AutoModel.from_pretrained(modelpath)

        self.max_length = self.tokenizer.model_max_length
        if "clip" in modelpath:
            self.text_encoded_dim = self.text_model.config.text_config.hidden_size
            if last_hidden_state:
                self.name = "clip_hidden"
            else:
                self.name = "clip"
        elif "bert" in modelpath:
            self.name = "bert"
            self.text_encoded_dim = self.text_model.config.hidden_size
        elif 't5' in modelpath:
            self.name = 't5'
        else:
            raise ValueError(f"Model {modelpath} not supported")

    def forward(self, texts: list[str]) -> torch.Tensor:
        # get prompt text embeddings
        if self.name in ["clip", "clip_hidden"]:
            text_inputs = self.tokenizer(
                texts,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            # split into max length Clip can handle
            if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
                text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]
        elif self.name == "bert":
            text_inputs = self.tokenizer(texts, return_tensors="pt", padding=True)

        if self.name == "clip":
            # (batch_Size, text_encoded_dim)
            text_embeddings = self.text_model.get_text_features(
                text_input_ids.to(self.text_model.device))
            # (batch_Size, 1, text_encoded_dim)
            text_embeddings = text_embeddings.unsqueeze(1)
        elif self.name == "clip_hidden":
            # (batch_Size, seq_length , text_encoded_dim)
            text_embeddings = self.text_model.text_model(
                text_input_ids.to(self.text_model.device)).last_hidden_state
        elif self.name == "bert":
            # (batch_Size, seq_length , text_encoded_dim)
            text_embeddings = self.text_model(
                **text_inputs.to(self.text_model.device)).last_hidden_state
        elif self.name == 't5':
            text_embeddings = self.text_model.encode(texts, show_progress_bar=False, convert_to_tensor=True, batch_size=len(texts))
            text_embeddings = text_embeddings.unsqueeze(1)
        else:
            raise NotImplementedError(f"Model {self.name} not implemented")

        return text_embeddings
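
A usage sketch (not in the commit); the CLIP checkpoint name is an assumption and the weights must be available locally or downloadable from the Hugging Face Hub.

import torch
from mld.models.architectures.mld_clip import MldTextEncoder

text_encoder = MldTextEncoder('openai/clip-vit-large-patch14', last_hidden_state=False)
with torch.no_grad():
    emb = text_encoder(["a person walks forward and waves"])
print(emb.shape)   # (batch_size, 1, text_encoded_dim), e.g. torch.Size([1, 1, 768])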
mld/models/architectures/mld_denoiser.py
ADDED
@@ -0,0 +1,172 @@
from typing import Optional, Union

import torch
import torch.nn as nn

from mld.models.architectures.tools.embeddings import (TimestepEmbedding,
                                                        Timesteps)
from mld.models.operator.cross_attention import (SkipTransformerEncoder,
                                                 TransformerDecoder,
                                                 TransformerDecoderLayer,
                                                 TransformerEncoder,
                                                 TransformerEncoderLayer)
from mld.models.operator.position_encoding import build_position_encoding


class MldDenoiser(nn.Module):

    def __init__(self,
                 latent_dim: list = [1, 256],
                 ff_size: int = 1024,
                 num_layers: int = 6,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 normalize_before: bool = False,
                 activation: str = "gelu",
                 flip_sin_to_cos: bool = True,
                 return_intermediate_dec: bool = False,
                 position_embedding: str = "learned",
                 arch: str = "trans_enc",
                 freq_shift: float = 0,
                 text_encoded_dim: int = 768,
                 time_cond_proj_dim: int = None,
                 is_controlnet: bool = False) -> None:

        super().__init__()

        self.latent_dim = latent_dim[-1]
        self.text_encoded_dim = text_encoded_dim

        self.arch = arch
        self.time_cond_proj_dim = time_cond_proj_dim

        self.time_proj = Timesteps(text_encoded_dim, flip_sin_to_cos, freq_shift)
        self.time_embedding = TimestepEmbedding(text_encoded_dim, self.latent_dim, cond_proj_dim=time_cond_proj_dim)
        if text_encoded_dim != self.latent_dim:
            self.emb_proj = nn.Sequential(nn.ReLU(), nn.Linear(text_encoded_dim, self.latent_dim))

        self.query_pos = build_position_encoding(
            self.latent_dim, position_embedding=position_embedding)

        if self.arch == "trans_enc":
            encoder_layer = TransformerEncoderLayer(
                self.latent_dim,
                num_heads,
                ff_size,
                dropout,
                activation,
                normalize_before,
            )
            encoder_norm = None if is_controlnet else nn.LayerNorm(self.latent_dim)
            self.encoder = SkipTransformerEncoder(encoder_layer, num_layers, encoder_norm,
                                                  return_intermediate=is_controlnet)

        elif self.arch == "trans_dec":
            assert not is_controlnet, f"controlnet not supported in architecture: 'trans_dec'"
            self.mem_pos = build_position_encoding(
                self.latent_dim, position_embedding=position_embedding)

            decoder_layer = TransformerDecoderLayer(
                self.latent_dim,
                num_heads,
                ff_size,
                dropout,
                activation,
                normalize_before,
            )
            decoder_norm = nn.LayerNorm(self.latent_dim)
            self.decoder = TransformerDecoder(
                decoder_layer,
                num_layers,
                decoder_norm,
                return_intermediate=return_intermediate_dec,
            )

        else:
            raise ValueError(f"Not supported architecture: {self.arch}!")

        self.is_controlnet = is_controlnet

        def zero_module(module):
            for p in module.parameters():
                nn.init.zeros_(p)
            return module

        if self.is_controlnet:
            self.controlnet_cond_embedding = nn.Sequential(
                nn.Linear(self.latent_dim, self.latent_dim),
                nn.Linear(self.latent_dim, self.latent_dim),
                zero_module(nn.Linear(self.latent_dim, self.latent_dim))
            )

            self.controlnet_down_mid_blocks = nn.ModuleList([
                zero_module(nn.Linear(self.latent_dim, self.latent_dim)) for _ in range(num_layers)])

    def forward(self,
                sample: torch.Tensor,
                timestep: torch.Tensor,
                encoder_hidden_states: torch.Tensor,
                timestep_cond: Optional[torch.Tensor] = None,
                controlnet_cond: Optional[torch.Tensor] = None,
                controlnet_residuals: Optional[list[torch.Tensor]] = None
                ) -> Union[torch.Tensor, list[torch.Tensor]]:

        # 0. dimension matching
        # sample [latent_dim[0], batch_size, latent_dim] <= [batch_size, latent_dim[0], latent_dim[1]]
        sample = sample.permute(1, 0, 2)

        # 1. check if controlnet
        if self.is_controlnet:
            controlnet_cond = controlnet_cond.permute(1, 0, 2)
            sample = sample + self.controlnet_cond_embedding(controlnet_cond)

        # 2. time_embedding
        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timestep.expand(sample.shape[1]).clone()
        time_emb = self.time_proj(timesteps)
        time_emb = time_emb.to(dtype=sample.dtype)
        # [1, bs, latent_dim] <= [bs, latent_dim]
        time_emb = self.time_embedding(time_emb, timestep_cond).unsqueeze(0)

        # 3. condition + time embedding
        # text_emb [seq_len, batch_size, text_encoded_dim] <= [batch_size, seq_len, text_encoded_dim]
        encoder_hidden_states = encoder_hidden_states.permute(1, 0, 2)
        text_emb = encoder_hidden_states  # [num_words, bs, latent_dim]
        # text embedding projection
        if self.text_encoded_dim != self.latent_dim:
            # [1 or 2, bs, latent_dim] <= [1 or 2, bs, text_encoded_dim]
            text_emb_latent = self.emb_proj(text_emb)
        else:
            text_emb_latent = text_emb
        emb_latent = torch.cat((time_emb, text_emb_latent), 0)

        # 4. transformer
        if self.arch == "trans_enc":
            xseq = torch.cat((sample, emb_latent), axis=0)

            xseq = self.query_pos(xseq)
            tokens = self.encoder(xseq, controlnet_residuals=controlnet_residuals)

            if self.is_controlnet:
                control_res_samples = []
                for res, block in zip(tokens, self.controlnet_down_mid_blocks):
                    r = block(res)
                    control_res_samples.append(r)
                return control_res_samples

            sample = tokens[:sample.shape[0]]

        elif self.arch == "trans_dec":
            # tgt - [1 or 5 or 10, bs, latent_dim]
            # memory - [token_num, bs, latent_dim]
            sample = self.query_pos(sample)
            emb_latent = self.mem_pos(emb_latent)
            sample = self.decoder(tgt=sample, memory=emb_latent).squeeze(0)

        else:
            raise TypeError(f"{self.arch} is not supported")

        # 5. [batch_size, latent_dim[0], latent_dim[1]] <= [latent_dim[0], batch_size, latent_dim[1]]
        sample = sample.permute(1, 0, 2)

        return sample
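
A shape-only sketch (not in the commit) of the default trans_enc path with random tensors; the latent and text dimensions below match the constructor defaults above, not any particular config file.

import torch
from mld.models.architectures.mld_denoiser import MldDenoiser

denoiser = MldDenoiser(latent_dim=[1, 256], text_encoded_dim=768)
sample = torch.randn(2, 1, 256)      # noisy latent: (bs, latent_dim[0], latent_dim[1])
timestep = torch.tensor([999])       # diffusion step, broadcast over the batch
text_emb = torch.randn(2, 1, 768)    # pooled text embedding: (bs, 1, text_encoded_dim)
out = denoiser(sample, timestep, text_emb)
print(out.shape)                     # torch.Size([2, 1, 256])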
mld/models/architectures/mld_traj_encoder.py
ADDED
@@ -0,0 +1,78 @@
from typing import Optional

import torch
import torch.nn as nn

from mld.models.operator.cross_attention import SkipTransformerEncoder, TransformerEncoderLayer
from mld.models.operator.position_encoding import build_position_encoding
from mld.utils.temos_utils import lengths_to_mask


class MldTrajEncoder(nn.Module):

    def __init__(self,
                 nfeats: int,
                 latent_dim: list = [1, 256],
                 ff_size: int = 1024,
                 num_layers: int = 9,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 normalize_before: bool = False,
                 activation: str = "gelu",
                 position_embedding: str = "learned") -> None:

        super().__init__()
        self.latent_size = latent_dim[0]
        self.latent_dim = latent_dim[-1]

        self.skel_embedding = nn.Linear(nfeats * 3, self.latent_dim)

        self.query_pos_encoder = build_position_encoding(
            self.latent_dim, position_embedding=position_embedding)

        encoder_layer = TransformerEncoderLayer(
            self.latent_dim,
            num_heads,
            ff_size,
            dropout,
            activation,
            normalize_before,
        )
        encoder_norm = nn.LayerNorm(self.latent_dim)
        self.encoder = SkipTransformerEncoder(encoder_layer, num_layers,
                                              encoder_norm)

        self.global_motion_token = nn.Parameter(
            torch.randn(self.latent_size, self.latent_dim))

    def forward(self, features: torch.Tensor, lengths: Optional[list[int]] = None,
                mask: Optional[torch.Tensor] = None) -> torch.Tensor:

        if lengths is None and mask is None:
            lengths = [len(feature) for feature in features]
            mask = lengths_to_mask(lengths, features.device)

        bs, nframes, nfeats = features.shape

        x = features
        # Embed each human poses into latent vectors
        x = self.skel_embedding(x)

        # Switch sequence and batch_size because the input of
        # Pytorch Transformer is [Sequence, Batch size, ...]
        x = x.permute(1, 0, 2)  # now it is [nframes, bs, latent_dim]

        # Each batch has its own set of tokens
        dist = torch.tile(self.global_motion_token[:, None, :], (1, bs, 1))

        # create a bigger mask, to allow attend to emb
        dist_masks = torch.ones((bs, dist.shape[0]), dtype=torch.bool, device=x.device)
        aug_mask = torch.cat((dist_masks, mask), 1)

        # adding the embedding token for all sequences
        xseq = torch.cat((dist, x), 0)

        xseq = self.query_pos_encoder(xseq)
        global_token = self.encoder(xseq, src_key_padding_mask=~aug_mask)[:dist.shape[0]]

        return global_token
mld/models/architectures/mld_vae.py
ADDED
@@ -0,0 +1,154 @@
from typing import Optional

import torch
import torch.nn as nn
from torch.distributions.distribution import Distribution

from mld.models.operator.cross_attention import (
    SkipTransformerEncoder,
    SkipTransformerDecoder,
    TransformerDecoder,
    TransformerDecoderLayer,
    TransformerEncoder,
    TransformerEncoderLayer,
)
from mld.models.operator.position_encoding import build_position_encoding
from mld.utils.temos_utils import lengths_to_mask


class MldVae(nn.Module):

    def __init__(self,
                 nfeats: int,
                 latent_dim: list = [1, 256],
                 ff_size: int = 1024,
                 num_layers: int = 9,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 arch: str = "encoder_decoder",
                 normalize_before: bool = False,
                 activation: str = "gelu",
                 position_embedding: str = "learned") -> None:

        super().__init__()

        self.latent_size = latent_dim[0]
        self.latent_dim = latent_dim[-1]
        input_feats = nfeats
        output_feats = nfeats
        self.arch = arch

        self.query_pos_encoder = build_position_encoding(
            self.latent_dim, position_embedding=position_embedding)

        encoder_layer = TransformerEncoderLayer(
            self.latent_dim,
            num_heads,
            ff_size,
            dropout,
            activation,
            normalize_before,
        )
        encoder_norm = nn.LayerNorm(self.latent_dim)
        self.encoder = SkipTransformerEncoder(encoder_layer, num_layers,
                                              encoder_norm)

        if self.arch == "all_encoder":
            decoder_norm = nn.LayerNorm(self.latent_dim)
            self.decoder = SkipTransformerEncoder(encoder_layer, num_layers,
                                                  decoder_norm)
        elif self.arch == 'encoder_decoder':
            self.query_pos_decoder = build_position_encoding(
                self.latent_dim, position_embedding=position_embedding)

            decoder_layer = TransformerDecoderLayer(
                self.latent_dim,
                num_heads,
                ff_size,
                dropout,
                activation,
                normalize_before,
            )
            decoder_norm = nn.LayerNorm(self.latent_dim)
            self.decoder = SkipTransformerDecoder(decoder_layer, num_layers,
                                                  decoder_norm)
        else:
            raise ValueError(f"Not support architecture: {self.arch}!")

        self.global_motion_token = nn.Parameter(
            torch.randn(self.latent_size * 2, self.latent_dim))

        self.skel_embedding = nn.Linear(input_feats, self.latent_dim)
        self.final_layer = nn.Linear(self.latent_dim, output_feats)

    def forward(self, features: torch.Tensor,
                lengths: Optional[list[int]] = None) -> tuple[torch.Tensor, torch.Tensor, Distribution]:
        z, dist = self.encode(features, lengths)
        feats_rst = self.decode(z, lengths)
        return feats_rst, z, dist

    def encode(self, features: torch.Tensor,
               lengths: Optional[list[int]] = None) -> tuple[torch.Tensor, Distribution]:
        if lengths is None:
            lengths = [len(feature) for feature in features]

        device = features.device

        bs, nframes, nfeats = features.shape
        mask = lengths_to_mask(lengths, device)

        x = features
        # Embed each human poses into latent vectors
        x = self.skel_embedding(x)

        # Switch sequence and batch_size because the input of
        # Pytorch Transformer is [Sequence, Batch size, ...]
        x = x.permute(1, 0, 2)  # now it is [nframes, bs, latent_dim]

        # Each batch has its own set of tokens
        dist = torch.tile(self.global_motion_token[:, None, :], (1, bs, 1))

        # create a bigger mask, to allow attend to emb
        dist_masks = torch.ones((bs, dist.shape[0]), dtype=torch.bool, device=x.device)
        aug_mask = torch.cat((dist_masks, mask), 1)

        # adding the embedding token for all sequences
        xseq = torch.cat((dist, x), 0)

        xseq = self.query_pos_encoder(xseq)
        dist = self.encoder(xseq, src_key_padding_mask=~aug_mask)[:dist.shape[0]]

        mu = dist[0:self.latent_size, ...]
        logvar = dist[self.latent_size:, ...]

        # resampling
        std = logvar.exp().pow(0.5)
        dist = torch.distributions.Normal(mu, std)
        latent = dist.rsample()
        return latent, dist

    def decode(self, z: torch.Tensor, lengths: list[int]) -> torch.Tensor:
        mask = lengths_to_mask(lengths, z.device)
        bs, nframes = mask.shape
        queries = torch.zeros(nframes, bs, self.latent_dim, device=z.device)

        if self.arch == "all_encoder":
            xseq = torch.cat((z, queries), axis=0)
            z_mask = torch.ones((bs, self.latent_size), dtype=torch.bool, device=z.device)
            aug_mask = torch.cat((z_mask, mask), axis=1)
            xseq = self.query_pos_decoder(xseq)
            output = self.decoder(xseq, src_key_padding_mask=~aug_mask)[z.shape[0]:]

        elif self.arch == "encoder_decoder":
            queries = self.query_pos_decoder(queries)
            output = self.decoder(
                tgt=queries,
                memory=z,
                tgt_key_padding_mask=~mask)

        output = self.final_layer(output)
        # zero for padded area
        output[~mask.T] = 0
        # Pytorch Transformer: [Sequence, Batch size, ...]
        feats = output.permute(1, 0, 2)
        return feats
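
A shape-only round trip (not in the commit) through the VAE with random features; lengths marks the valid frames per sequence and padded frames are zeroed on decode.

import torch
from mld.models.architectures.mld_vae import MldVae

vae = MldVae(nfeats=263, latent_dim=[1, 256])
feats = torch.randn(2, 60, 263)              # (bs, nframes, nfeats)
z, dist = vae.encode(feats, lengths=[60, 48])
print(z.shape)                               # torch.Size([1, 2, 256])
recons = vae.decode(z, lengths=[60, 48])
print(recons.shape)                          # torch.Size([2, 60, 263])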
mld/models/architectures/t2m_motionenc.py
ADDED
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pack_padded_sequence
+
+
+class MovementConvEncoder(nn.Module):
+    def __init__(self, input_size: int, hidden_size: int, output_size: int) -> None:
+        super(MovementConvEncoder, self).__init__()
+        self.main = nn.Sequential(
+            nn.Conv1d(input_size, hidden_size, 4, 2, 1),
+            nn.Dropout(0.2, inplace=True),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv1d(hidden_size, output_size, 4, 2, 1),
+            nn.Dropout(0.2, inplace=True),
+            nn.LeakyReLU(0.2, inplace=True),
+        )
+        self.out_net = nn.Linear(output_size, output_size)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        inputs = inputs.permute(0, 2, 1)
+        outputs = self.main(inputs).permute(0, 2, 1)
+        return self.out_net(outputs)
+
+
+class MotionEncoderBiGRUCo(nn.Module):
+    def __init__(self, input_size: int, hidden_size: int, output_size: int) -> None:
+        super(MotionEncoderBiGRUCo, self).__init__()
+
+        self.input_emb = nn.Linear(input_size, hidden_size)
+        self.gru = nn.GRU(
+            hidden_size, hidden_size, batch_first=True, bidirectional=True
+        )
+        self.output_net = nn.Sequential(
+            nn.Linear(hidden_size * 2, hidden_size),
+            nn.LayerNorm(hidden_size),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Linear(hidden_size, output_size),
+        )
+
+        self.hidden_size = hidden_size
+        self.hidden = nn.Parameter(
+            torch.randn((2, 1, self.hidden_size), requires_grad=True)
+        )
+
+    def forward(self, inputs: torch.Tensor, m_lens: torch.Tensor) -> torch.Tensor:
+        num_samples = inputs.shape[0]
+
+        input_embs = self.input_emb(inputs)
+        hidden = self.hidden.repeat(1, num_samples, 1)
+
+        cap_lens = m_lens.data.tolist()
+        emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
+
+        gru_seq, gru_last = self.gru(emb, hidden)
+
+        gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
+
+        return self.output_net(gru_last)
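
Note (not part of the diff): a hedged usage sketch of the evaluator encoders above. The sizes (259/512 movement features, 1024 hidden units, 512-d output) and the 4x temporal downsampling mirror the usual T2M evaluator configuration and are assumptions here, not values fixed by this file. Because pack_padded_sequence is called with its default enforce_sorted=True, m_lens must be sorted in descending order.

# Illustrative only; dimensions are assumed, not taken from this commit.
import torch

mov_enc = MovementConvEncoder(input_size=259, hidden_size=512, output_size=512)
motion_enc = MotionEncoderBiGRUCo(input_size=512, hidden_size=1024, output_size=512)

feats = torch.randn(4, 196, 259)            # padded motion features (bs, nframes, nfeats)
m_lens = torch.tensor([196, 160, 128, 64])  # true lengths, sorted descending

movements = mov_enc(feats)                  # two stride-2 convs: (4, 49, 512)
motion_emb = motion_enc(movements, m_lens // 4)
print(movements.shape, motion_emb.shape)    # (4, 49, 512) (4, 512)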
mld/models/architectures/t2m_textenc.py
ADDED
@@ -0,0 +1,43 @@
+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pack_padded_sequence
+
+
+class TextEncoderBiGRUCo(nn.Module):
+    def __init__(self, word_size: int, pos_size: int, hidden_size: int, output_size: int) -> None:
+        super(TextEncoderBiGRUCo, self).__init__()
+
+        self.pos_emb = nn.Linear(pos_size, word_size)
+        self.input_emb = nn.Linear(word_size, hidden_size)
+        self.gru = nn.GRU(
+            hidden_size, hidden_size, batch_first=True, bidirectional=True
+        )
+        self.output_net = nn.Sequential(
+            nn.Linear(hidden_size * 2, hidden_size),
+            nn.LayerNorm(hidden_size),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Linear(hidden_size, output_size),
+        )
+
+        self.hidden_size = hidden_size
+        self.hidden = nn.Parameter(
+            torch.randn((2, 1, self.hidden_size), requires_grad=True)
+        )
+
+    def forward(self, word_embs: torch.Tensor, pos_onehot: torch.Tensor,
+                cap_lens: torch.Tensor) -> torch.Tensor:
+        num_samples = word_embs.shape[0]
+
+        pos_embs = self.pos_emb(pos_onehot)
+        inputs = word_embs + pos_embs
+        input_embs = self.input_emb(inputs)
+        hidden = self.hidden.repeat(1, num_samples, 1)
+
+        cap_lens = cap_lens.data.tolist()
+        emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
+
+        gru_seq, gru_last = self.gru(emb, hidden)
+
+        gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
+
+        return self.output_net(gru_last)
mld/models/architectures/tools/embeddings.py
ADDED
@@ -0,0 +1,89 @@
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+
+def get_timestep_embedding(
+    timesteps: torch.Tensor,
+    embedding_dim: int,
+    flip_sin_to_cos: bool = False,
+    downscale_freq_shift: float = 1,
+    scale: float = 1,
+    max_period: int = 10000,
+) -> torch.Tensor:
+    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
+
+    half_dim = embedding_dim // 2
+    exponent = -math.log(max_period) * torch.arange(
+        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
+    )
+    exponent = exponent / (half_dim - downscale_freq_shift)
+
+    emb = torch.exp(exponent)
+    emb = timesteps[:, None].float() * emb[None, :]
+
+    # scale embeddings
+    emb = scale * emb
+
+    # concat sine and cosine embeddings
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+    # flip sine and cosine embeddings
+    if flip_sin_to_cos:
+        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
+
+    # zero pad
+    if embedding_dim % 2 == 1:
+        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+    return emb
+
+
+class TimestepEmbedding(nn.Module):
+    def __init__(self, channel: int, time_embed_dim: int,
+                 act_fn: str = "silu", cond_proj_dim: Optional[int] = None) -> None:
+        super().__init__()
+
+        # distill CFG
+        if cond_proj_dim is not None:
+            self.cond_proj = nn.Linear(cond_proj_dim, channel, bias=False)
+            self.cond_proj.weight.data.fill_(0.0)
+        else:
+            self.cond_proj = None
+
+        self.linear_1 = nn.Linear(channel, time_embed_dim)
+        self.act = None
+        if act_fn == "silu":
+            self.act = nn.SiLU()
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
+
+    def forward(self, sample: torch.Tensor, timestep_cond: Optional[torch.Tensor] = None) -> torch.Tensor:
+        if timestep_cond is not None:
+            sample = sample + self.cond_proj(timestep_cond)
+
+        sample = self.linear_1(sample)
+
+        if self.act is not None:
+            sample = self.act(sample)
+
+        sample = self.linear_2(sample)
+        return sample
+
+
+class Timesteps(nn.Module):
+    def __init__(self, num_channels: int, flip_sin_to_cos: bool,
+                 downscale_freq_shift: float) -> None:
+        super().__init__()
+        self.num_channels = num_channels
+        self.flip_sin_to_cos = flip_sin_to_cos
+        self.downscale_freq_shift = downscale_freq_shift
+
+    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
+        t_emb = get_timestep_embedding(
+            timesteps,
+            self.num_channels,
+            flip_sin_to_cos=self.flip_sin_to_cos,
+            downscale_freq_shift=self.downscale_freq_shift,
+        )
+        return t_emb
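
Note (not part of the diff): a hedged sketch of how Timesteps and TimestepEmbedding above are usually composed inside a denoiser. The dimensions and the guidance-scale value are assumptions. The zero-initialized cond_proj is what lets a distilled model feed a guidance-scale embedding through timestep_cond without perturbing the timestep path at the start of training.

# Illustrative only; the 256/1024 dims and the guidance scale are assumptions.
import torch

time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0.0)
time_embedding = TimestepEmbedding(channel=256, time_embed_dim=1024, cond_proj_dim=256)

t = torch.randint(0, 1000, (4,))                    # one diffusion timestep per sample
t_emb = time_proj(t)                                # (4, 256) sinusoidal features
w_emb = time_proj(torch.full((4,), 7.5))            # guidance-scale embedding (CFG distillation)
emb = time_embedding(t_emb, timestep_cond=w_emb)    # (4, 1024)
print(emb.shape)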
mld/models/metrics/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .tm2t import TM2TMetrics
+from .mm import MMMetrics
+from .cm import ControlMetrics
mld/models/metrics/cm.py
ADDED
@@ -0,0 +1,55 @@
+import torch
+from torchmetrics import Metric
+from torchmetrics.utilities import dim_zero_cat
+
+from mld.utils.temos_utils import remove_padding
+from .utils import calculate_skating_ratio, calculate_trajectory_error, control_l2
+
+
+class ControlMetrics(Metric):
+
+    def __init__(self, dist_sync_on_step: bool = True) -> None:
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+
+        self.name = "control_metrics"
+
+        self.add_state("count_seq", default=torch.tensor(0), dist_reduce_fx="sum")
+        self.add_state("skate_ratio_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
+        self.add_state("dist_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
+        self.add_state("traj_err", default=[], dist_reduce_fx="cat")
+        self.traj_err_key = ["traj_fail_20cm", "traj_fail_50cm", "kps_fail_20cm", "kps_fail_50cm", "kps_mean_err(m)"]
+
+    def compute(self) -> dict:
+        count_seq = self.count_seq.item()
+
+        metrics = dict()
+        metrics['Skating Ratio'] = self.skate_ratio_sum / count_seq
+        metrics['Control L2 dist'] = self.dist_sum / count_seq
+        traj_err = dim_zero_cat(self.traj_err).mean(0)
+
+        for (k, v) in zip(self.traj_err_key, traj_err):
+            metrics[k] = v
+
+        return {**metrics}
+
+    def update(self, joints: torch.Tensor, hint: torch.Tensor,
+               mask_hint: torch.Tensor, lengths: list[int]) -> None:
+        self.count_seq += len(lengths)
+
+        joints_no_padding = remove_padding(joints, lengths)
+        for j in joints_no_padding:
+            skate_ratio, _ = calculate_skating_ratio(j.unsqueeze(0).permute(0, 2, 3, 1))
+            self.skate_ratio_sum += skate_ratio[0]
+
+        joints_np = joints.cpu().numpy()
+        hint_np = hint.cpu().numpy()
+        mask_hint_np = mask_hint.cpu().numpy()
+
+        for j, h, m in zip(joints_np, hint_np, mask_hint_np):
+            control_error = control_l2(j[None], h[None], m[None])
+            mean_error = control_error.sum() / m.sum()
+            self.dist_sum += mean_error
+            control_error = control_error.reshape(-1)
+            m = m.reshape(-1)
+            err_np = calculate_trajectory_error(control_error, mean_error, m)
+            self.traj_err.append(torch.tensor(err_np[None], device=joints.device))
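
Note (not part of the diff): a hedged sketch of the torchmetrics lifecycle for ControlMetrics. The tensor shapes (22 joints, xyz coordinates, a per-joint hint mask) are assumptions about the surrounding evaluation code, and the helpers it calls live in .utils; adapt the shapes to the real dataloader.

# Illustrative only; tensor shapes are assumed, not taken from this commit.
import torch

metric = ControlMetrics()
for _ in range(2):                          # stand-in for an evaluation dataloader
    bs, seq = 2, 196
    joints = torch.rand(bs, seq, 22, 3)     # generated joint positions
    hint = joints.clone()                   # spatial control signal (same layout)
    mask_hint = torch.zeros(bs, seq, 22, 1)
    mask_hint[:, ::30, 0] = 1.0             # sparse pelvis hints every 30 frames
    metric.update(joints, hint, mask_hint, lengths=[seq] * bs)

results = metric.compute()                  # dict: skating ratio, control L2, trajectory errors
metric.reset()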