mingyuan committed on
Commit a0d91d3
1 Parent(s): 39ee545

initial commit

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitignore +1 -0
  2. README.md +6 -9
  3. app.py +123 -0
  4. configs/_base_/datasets/human_ml3d_bs128.py +60 -0
  5. configs/_base_/datasets/kit_ml_bs128.py +60 -0
  6. configs/mdm/mdm_t2m_official.py +67 -0
  7. configs/motiondiffuse/motiondiffuse_kit.py +89 -0
  8. configs/motiondiffuse/motiondiffuse_t2m.py +90 -0
  9. configs/remodiffuse/remodiffuse_kit.py +141 -0
  10. configs/remodiffuse/remodiffuse_t2m.py +141 -0
  11. data/database/t2m_text_train.npz +3 -0
  12. data/datasets/human_ml3d/mean.npy +3 -0
  13. data/datasets/human_ml3d/std.npy +3 -0
  14. data/datasets/kit_ml/mean.npy +3 -0
  15. data/datasets/kit_ml/std.npy +3 -0
  16. logs/mdm/mdm_t2m/latest.pth +3 -0
  17. logs/motiondiffuse/motiondiffuse_t2m/latest.pth +3 -0
  18. logs/remodiffuse/remodiffuse_t2m/latest.pth +3 -0
  19. mogen/__init__.py +56 -0
  20. mogen/apis/__init__.py +13 -0
  21. mogen/apis/test.py +160 -0
  22. mogen/apis/train.py +165 -0
  23. mogen/core/__init__.py +0 -0
  24. mogen/core/distributed_wrapper.py +136 -0
  25. mogen/core/evaluation/__init__.py +4 -0
  26. mogen/core/evaluation/builder.py +29 -0
  27. mogen/core/evaluation/eval_hooks.py +138 -0
  28. mogen/core/evaluation/evaluators/__init__.py +0 -0
  29. mogen/core/evaluation/evaluators/base_evaluator.py +144 -0
  30. mogen/core/evaluation/evaluators/diversity_evaluator.py +52 -0
  31. mogen/core/evaluation/evaluators/fid_evaluator.py +58 -0
  32. mogen/core/evaluation/evaluators/matching_score_evaluator.py +71 -0
  33. mogen/core/evaluation/evaluators/multimodality_evaluator.py +63 -0
  34. mogen/core/evaluation/evaluators/precision_evaluator.py +74 -0
  35. mogen/core/evaluation/get_model.py +46 -0
  36. mogen/core/evaluation/utils.py +130 -0
  37. mogen/core/optimizer/__init__.py +3 -0
  38. mogen/core/optimizer/builder.py +52 -0
  39. mogen/datasets/__init__.py +11 -0
  40. mogen/datasets/base_dataset.py +117 -0
  41. mogen/datasets/builder.py +113 -0
  42. mogen/datasets/dataset_wrappers.py +42 -0
  43. mogen/datasets/pipelines/__init__.py +18 -0
  44. mogen/datasets/pipelines/compose.py +42 -0
  45. mogen/datasets/pipelines/formatting.py +134 -0
  46. mogen/datasets/pipelines/transforms.py +120 -0
  47. mogen/datasets/samplers/__init__.py +3 -0
  48. mogen/datasets/samplers/distributed_sampler.py +42 -0
  49. mogen/datasets/text_motion_dataset.py +93 -0
  50. mogen/models/__init__.py +7 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ **__pycache__**
README.md CHANGED
@@ -1,12 +1,9 @@
- ---
- title: ReMoDiffuse
- emoji: 📚
- colorFrom: red
- colorTo: gray
+ title: MotionDiffuse
+ emoji: 🏢
+ colorFrom: blue
+ colorTo: red
  sdk: gradio
- sdk_version: 3.43.2
+ sdk_version: 3.44.1
  app_file: app.py
  pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ license: mit
 
app.py ADDED
@@ -0,0 +1,123 @@
1
+ import os
2
+ import sys
3
+ import gradio as gr
4
+
5
+ os.makedirs("outputs", exist_ok=True)
6
+ sys.path.insert(0, '.')
7
+
8
+ import argparse
9
+ import os.path as osp
10
+ import mmcv
11
+ import numpy as np
12
+ import torch
13
+ from mogen.models import build_architecture
14
+ from mmcv.runner import load_checkpoint
15
+ from mmcv.parallel import MMDataParallel
16
+ from mogen.utils.plot_utils import (
17
+ recover_from_ric,
18
+ plot_3d_motion,
19
+ t2m_kinematic_chain
20
+ )
21
+ from scipy.ndimage import gaussian_filter
22
+ from IPython.display import Image
23
+
24
+
25
+ def motion_temporal_filter(motion, sigma=1):
26
+ motion = motion.reshape(motion.shape[0], -1)
27
+ for i in range(motion.shape[1]):
28
+ motion[:, i] = gaussian_filter(motion[:, i], sigma=sigma, mode="nearest")
29
+ return motion.reshape(motion.shape[0], -1, 3)
30
+
31
+
32
+ def plot_t2m(data, result_path, npy_path, caption):
33
+ joint = recover_from_ric(torch.from_numpy(data).float(), 22).numpy()
34
+ joint = motion_temporal_filter(joint, sigma=2.5)
35
+ plot_3d_motion(result_path, t2m_kinematic_chain, joint, title=caption, fps=20)
36
+ if npy_path is not None:
37
+ np.save(npy_path, joint)
38
+
39
+ def create_remodiffuse():
40
+ config_path = "configs/remodiffuse/remodiffuse_t2m.py"
41
+ ckpt_path = "logs/remodiffuse/remodiffuse_t2m/latest.pth"
42
+ cfg = mmcv.Config.fromfile(config_path)
43
+ model = build_architecture(cfg.model)
44
+ load_checkpoint(model, ckpt_path, map_location='cpu')
45
+ model.cpu()
46
+ model.eval()
47
+ return model
48
+
49
+ def create_motiondiffuse():
50
+ config_path = "configs/motiondiffuse/motiondiffuse_t2m.py"
51
+ ckpt_path = "logs/motiondiffuse/motiondiffuse_t2m/latest.pth"
52
+ cfg = mmcv.Config.fromfile(config_path)
53
+ model = build_architecture(cfg.model)
54
+ load_checkpoint(model, ckpt_path, map_location='cpu')
55
+ model.cpu()
56
+ model.eval()
57
+ return model
58
+
59
+ def create_mdm():
60
+ config_path = "configs/mdm/mdm_t2m_official.py"
61
+ ckpt_path = "logs/mdm/mdm_t2m/latest.pth"
62
+ cfg = mmcv.Config.fromfile(config_path)
63
+ model = build_architecture(cfg.model)
64
+ load_checkpoint(model, ckpt_path, map_location='cpu')
65
+ model.cpu()
66
+ model.eval()
67
+ return model
68
+
69
+ model_remodiffuse = create_remodiffuse()
70
+ # model_motiondiffuse = create_motiondiffuse()
71
+ # model_mdm = create_mdm()
72
+
73
+ mean_path = "data/datasets/human_ml3d/mean.npy"
74
+ std_path = "data/datasets/human_ml3d/std.npy"
75
+ mean = np.load(mean_path)
76
+ std = np.load(std_path)
77
+
78
+
79
+ def show_generation_result(model, text, motion_length, result_path):
80
+ device = 'cpu'
81
+ motion = torch.zeros(1, motion_length, 263).to(device)
82
+ motion_mask = torch.ones(1, motion_length).to(device)
83
+ motion_length = torch.Tensor([motion_length]).long().to(device)
84
+ model = model.to(device)
85
+ input = {
86
+ 'motion': motion,
87
+ 'motion_mask': motion_mask,
88
+ 'motion_length': motion_length,
89
+ 'motion_metas': [{'text': text}],
90
+ }
91
+
92
+ all_pred_motion = []
93
+ with torch.no_grad():
94
+ input['inference_kwargs'] = {}
95
+ output_list = []
96
+ output = model(**input)[0]['pred_motion']
97
+ pred_motion = output.cpu().detach().numpy()
98
+ pred_motion = pred_motion * std + mean
99
+
100
+ plot_t2m(pred_motion, result_path, None, text)
101
+
102
+ def generate(prompt, length):
103
+ if not os.path.exists("outputs"):
104
+ os.mkdir("outputs")
105
+ result_path = "outputs/" + str(hash(prompt)) + ".mp4"
106
+ show_generation_result(model_remodiffuse, prompt, length, result_path)
107
+ return result_path
108
+
109
+ demo = gr.Interface(
110
+ fn=generate,
111
+ inputs=["text", gr.Slider(20, 196, value=60)],
112
+ examples=[
113
+ ["the man throws a punch with each hand.", 58],
114
+ ["a person spins quickly and takes off running.", 29],
115
+ ["a person quickly waves with their right hand", 46],
116
+ ["a person performing a slight bow", 89],
117
+ ],
118
+ outputs="video",
119
+ title="ReMoDiffuse: Retrieval-Augmented Motion Diffusion Model",
120
+ description="This is an interactive demo for ReMoDiffuse. For more information, feel free to visit our project page(https://mingyuan-zhang.github.io/projects/ReMoDiffuse.html).")
121
+
122
+ demo.queue()
123
+ demo.launch()
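
Note: app.py is the Space entry point. It loads only the ReMoDiffuse checkpoint, denormalizes the sampled motion with the HumanML3D mean/std, and renders the result to an MP4. A minimal sketch of driving the same pipeline without the Gradio UI, assuming the checkpoints and data files listed above are present (the prompt and length are illustrative):

    # sketch only: importing app executes the checkpoint loading at module level
    from app import generate

    video_path = generate("a person quickly waves with their right hand", 46)
    print(video_path)  # e.g. outputs/<hash-of-prompt>.mp4
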
configs/_base_/datasets/human_ml3d_bs128.py ADDED
@@ -0,0 +1,60 @@
1
+ # dataset settings
2
+ data_keys = ['motion', 'motion_mask', 'motion_length', 'clip_feat']
3
+ meta_keys = ['text', 'token']
4
+ train_pipeline = [
5
+ dict(
6
+ type='Normalize',
7
+ mean_path='data/datasets/human_ml3d/mean.npy',
8
+ std_path='data/datasets/human_ml3d/std.npy'),
9
+ dict(type='Crop', crop_size=196),
10
+ dict(type='ToTensor', keys=data_keys),
11
+ dict(type='Collect', keys=data_keys, meta_keys=meta_keys)
12
+ ]
13
+
14
+ data = dict(
15
+ samples_per_gpu=128,
16
+ workers_per_gpu=1,
17
+ train=dict(
18
+ type='RepeatDataset',
19
+ dataset=dict(
20
+ type='TextMotionDataset',
21
+ dataset_name='human_ml3d',
22
+ data_prefix='data',
23
+ pipeline=train_pipeline,
24
+ ann_file='train.txt',
25
+ motion_dir='motions',
26
+ text_dir='texts',
27
+ token_dir='tokens',
28
+ clip_feat_dir='clip_feats',
29
+ ),
30
+ times=200
31
+ ),
32
+ test=dict(
33
+ type='TextMotionDataset',
34
+ dataset_name='human_ml3d',
35
+ data_prefix='data',
36
+ pipeline=train_pipeline,
37
+ ann_file='test.txt',
38
+ motion_dir='motions',
39
+ text_dir='texts',
40
+ token_dir='tokens',
41
+ clip_feat_dir='clip_feats',
42
+ eval_cfg=dict(
43
+ shuffle_indexes=True,
44
+ replication_times=20,
45
+ replication_reduction='statistics',
46
+ text_encoder_name='human_ml3d',
47
+ text_encoder_path='data/evaluators/human_ml3d/finest.tar',
48
+ motion_encoder_name='human_ml3d',
49
+ motion_encoder_path='data/evaluators/human_ml3d/finest.tar',
50
+ metrics=[
51
+ dict(type='R Precision', batch_size=32, top_k=3),
52
+ dict(type='Matching Score', batch_size=32),
53
+ dict(type='FID'),
54
+ dict(type='Diversity', num_samples=300),
55
+ dict(type='MultiModality', num_samples=100, num_repeats=30, num_picks=10)
56
+ ]
57
+ ),
58
+ test_mode=True
59
+ )
60
+ )
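
Note: this file is a _base_ fragment, pulled into the model configs below via mmcv config inheritance. A minimal sketch of reading the merged result back, assuming an mmcv version within the 1.4.2-1.9.0 range required by mogen/__init__.py:

    import mmcv

    cfg = mmcv.Config.fromfile('configs/remodiffuse/remodiffuse_t2m.py')
    print(cfg.data.samples_per_gpu)         # 128, inherited from this _base_ file
    print(cfg.data.train.dataset.ann_file)  # 'train.txt'
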
configs/_base_/datasets/kit_ml_bs128.py ADDED
@@ -0,0 +1,60 @@
1
+ # dataset settings
2
+ data_keys = ['motion', 'motion_mask', 'motion_length', 'clip_feat']
3
+ meta_keys = ['text', 'token']
4
+ train_pipeline = [
5
+ dict(type='Crop', crop_size=196),
6
+ dict(
7
+ type='Normalize',
8
+ mean_path='data/datasets/kit_ml/mean.npy',
9
+ std_path='data/datasets/kit_ml/std.npy'),
10
+ dict(type='ToTensor', keys=data_keys),
11
+ dict(type='Collect', keys=data_keys, meta_keys=meta_keys)
12
+ ]
13
+
14
+ data = dict(
15
+ samples_per_gpu=128,
16
+ workers_per_gpu=1,
17
+ train=dict(
18
+ type='RepeatDataset',
19
+ dataset=dict(
20
+ type='TextMotionDataset',
21
+ dataset_name='kit_ml',
22
+ data_prefix='data',
23
+ pipeline=train_pipeline,
24
+ ann_file='train.txt',
25
+ motion_dir='motions',
26
+ text_dir='texts',
27
+ token_dir='tokens',
28
+ clip_feat_dir='clip_feats',
29
+ ),
30
+ times=100
31
+ ),
32
+ test=dict(
33
+ type='TextMotionDataset',
34
+ dataset_name='kit_ml',
35
+ data_prefix='data',
36
+ pipeline=train_pipeline,
37
+ ann_file='test.txt',
38
+ motion_dir='motions',
39
+ text_dir='texts',
40
+ token_dir='tokens',
41
+ clip_feat_dir='clip_feats',
42
+ eval_cfg=dict(
43
+ shuffle_indexes=True,
44
+ replication_times=20,
45
+ replication_reduction='statistics',
46
+ text_encoder_name='kit_ml',
47
+ text_encoder_path='data/evaluators/kit_ml/finest.tar',
48
+ motion_encoder_name='kit_ml',
49
+ motion_encoder_path='data/evaluators/kit_ml/finest.tar',
50
+ metrics=[
51
+ dict(type='R Precision', batch_size=32, top_k=3),
52
+ dict(type='Matching Score', batch_size=32),
53
+ dict(type='FID'),
54
+ dict(type='Diversity', num_samples=300),
55
+ dict(type='MultiModality', num_samples=50, num_repeats=30, num_picks=10)
56
+ ]
57
+ ),
58
+ test_mode=True
59
+ )
60
+ )
configs/mdm/mdm_t2m_official.py ADDED
@@ -0,0 +1,67 @@
1
+ _base_ = ['../_base_/datasets/human_ml3d_bs128.py']
2
+
3
+ # checkpoint saving
4
+ checkpoint_config = dict(interval=1)
5
+
6
+ dist_params = dict(backend='nccl')
7
+ log_level = 'INFO'
8
+ load_from = None
9
+ resume_from = None
10
+ workflow = [('train', 1)]
11
+
12
+ # optimizer
13
+ optimizer = dict(type='Adam', lr=1e-4)
14
+ optimizer_config = dict(grad_clip=None)
15
+ # learning policy
16
+ lr_config = dict(policy='step', step=[])
17
+ runner = dict(type='EpochBasedRunner', max_epochs=50)
18
+
19
+ log_config = dict(
20
+ interval=50,
21
+ hooks=[
22
+ dict(type='TextLoggerHook'),
23
+ # dict(type='TensorboardLoggerHook')
24
+ ])
25
+
26
+ input_feats = 263
27
+ max_seq_len = 196
28
+ latent_dim = 512
29
+ time_embed_dim = 2048
30
+ text_latent_dim = 256
31
+ ff_size = 1024
32
+ num_layers = 8
33
+ num_heads = 4
34
+ dropout = 0.1
35
+ cond_mask_prob = 0.1
36
+ # model settings
37
+ model = dict(
38
+ type='MotionDiffusion',
39
+ model=dict(
40
+ type='MDMTransformer',
41
+ input_feats=input_feats,
42
+ latent_dim=latent_dim,
43
+ ff_size=ff_size,
44
+ num_layers=num_layers,
45
+ num_heads=num_heads,
46
+ dropout=dropout,
47
+ time_embed_dim=time_embed_dim,
48
+ cond_mask_prob=cond_mask_prob,
49
+ guide_scale=2.5,
50
+ clip_version='ViT-B/32',
51
+ use_official_ckpt=True
52
+ ),
53
+ loss_recon=dict(type='MSELoss', loss_weight=1, reduction='none'),
54
+ diffusion_train=dict(
55
+ beta_scheduler='cosine',
56
+ diffusion_steps=1000,
57
+ model_mean_type='start_x',
58
+ model_var_type='fixed_small',
59
+ ),
60
+ diffusion_test=dict(
61
+ beta_scheduler='cosine',
62
+ diffusion_steps=1000,
63
+ model_mean_type='start_x',
64
+ model_var_type='fixed_small',
65
+ ),
66
+ inference_type='ddpm'
67
+ )
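
Note: cond_mask_prob=0.1 together with guide_scale=2.5 is the usual classifier-free guidance setup: the text condition is dropped at random during training, and at sampling time the conditional and unconditional predictions are blended. A hedged sketch of that blend (variable names are illustrative, not taken from the MDMTransformer code):

    # x0_uncond: prediction with the text condition masked out
    # x0_cond:   prediction with the text condition present
    x0_guided = x0_uncond + guide_scale * (x0_cond - x0_uncond)
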
configs/motiondiffuse/motiondiffuse_kit.py ADDED
@@ -0,0 +1,89 @@
1
+ _base_ = ['../_base_/datasets/kit_ml_bs128.py']
2
+
3
+ # checkpoint saving
4
+ checkpoint_config = dict(interval=1)
5
+
6
+ dist_params = dict(backend='nccl')
7
+ log_level = 'INFO'
8
+ load_from = None
9
+ resume_from = None
10
+ workflow = [('train', 1)]
11
+
12
+ # optimizer
13
+ optimizer = dict(type='Adam', lr=2e-4)
14
+ optimizer_config = dict(grad_clip=None)
15
+ # learning policy
16
+ lr_config = dict(policy='step', step=[])
17
+ runner = dict(type='EpochBasedRunner', max_epochs=50)
18
+
19
+ log_config = dict(
20
+ interval=50,
21
+ hooks=[
22
+ dict(type='TextLoggerHook'),
23
+ # dict(type='TensorboardLoggerHook')
24
+ ])
25
+
26
+ input_feats = 251
27
+ max_seq_len = 196
28
+ latent_dim = 512
29
+ time_embed_dim = 2048
30
+ text_latent_dim = 256
31
+ ff_size = 1024
32
+ num_heads = 8
33
+ dropout = 0
34
+ # model settings
35
+ model = dict(
36
+ type='MotionDiffusion',
37
+ model=dict(
38
+ type='MotionDiffuseTransformer',
39
+ input_feats=input_feats,
40
+ max_seq_len=max_seq_len,
41
+ latent_dim=latent_dim,
42
+ time_embed_dim=time_embed_dim,
43
+ num_layers=8,
44
+ sa_block_cfg=dict(
45
+ type='EfficientSelfAttention',
46
+ latent_dim=latent_dim,
47
+ num_heads=num_heads,
48
+ dropout=dropout,
49
+ time_embed_dim=time_embed_dim
50
+ ),
51
+ ca_block_cfg=dict(
52
+ type='EfficientCrossAttention',
53
+ latent_dim=latent_dim,
54
+ text_latent_dim=text_latent_dim,
55
+ num_heads=num_heads,
56
+ dropout=dropout,
57
+ time_embed_dim=time_embed_dim
58
+ ),
59
+ ffn_cfg=dict(
60
+ latent_dim=latent_dim,
61
+ ffn_dim=ff_size,
62
+ dropout=dropout,
63
+ time_embed_dim=time_embed_dim
64
+ ),
65
+ text_encoder=dict(
66
+ pretrained_model='clip',
67
+ latent_dim=text_latent_dim,
68
+ num_layers=4,
69
+ num_heads=4,
70
+ ff_size=2048,
71
+ dropout=dropout,
72
+ use_text_proj=True
73
+ )
74
+ ),
75
+ loss_recon=dict(type='MSELoss', loss_weight=1, reduction='none'),
76
+ diffusion_train=dict(
77
+ beta_scheduler='linear',
78
+ diffusion_steps=1000,
79
+ model_mean_type='epsilon',
80
+ model_var_type='fixed_small',
81
+ ),
82
+ diffusion_test=dict(
83
+ beta_scheduler='linear',
84
+ diffusion_steps=1000,
85
+ model_mean_type='epsilon',
86
+ model_var_type='fixed_small',
87
+ ),
88
+ inference_type='ddpm'
89
+ )
configs/motiondiffuse/motiondiffuse_t2m.py ADDED
@@ -0,0 +1,90 @@
1
+ _base_ = ['../_base_/datasets/human_ml3d_bs128.py']
2
+
3
+ # checkpoint saving
4
+ checkpoint_config = dict(interval=1)
5
+
6
+ dist_params = dict(backend='nccl')
7
+ log_level = 'INFO'
8
+ load_from = None
9
+ resume_from = None
10
+ workflow = [('train', 1)]
11
+
12
+ # optimizer
13
+ optimizer = dict(type='Adam', lr=2e-4)
14
+ optimizer_config = dict(grad_clip=None)
15
+ # learning policy
16
+ lr_config = dict(policy='step', step=[])
17
+ runner = dict(type='EpochBasedRunner', max_epochs=50)
18
+
19
+ log_config = dict(
20
+ interval=50,
21
+ hooks=[
22
+ dict(type='TextLoggerHook'),
23
+ # dict(type='TensorboardLoggerHook')
24
+ ])
25
+
26
+ input_feats = 263
27
+ max_seq_len = 196
28
+ latent_dim = 512
29
+ time_embed_dim = 2048
30
+ text_latent_dim = 256
31
+ ff_size = 1024
32
+ num_heads = 8
33
+ dropout = 0
34
+ # model settings
35
+ model = dict(
36
+ type='MotionDiffusion',
37
+ model=dict(
38
+ type='MotionDiffuseTransformer',
39
+ input_feats=input_feats,
40
+ max_seq_len=max_seq_len,
41
+ latent_dim=latent_dim,
42
+ time_embed_dim=time_embed_dim,
43
+ num_layers=8,
44
+ sa_block_cfg=dict(
45
+ type='EfficientSelfAttention',
46
+ latent_dim=latent_dim,
47
+ num_heads=num_heads,
48
+ dropout=dropout,
49
+ time_embed_dim=time_embed_dim
50
+ ),
51
+ ca_block_cfg=dict(
52
+ type='EfficientCrossAttention',
53
+ latent_dim=latent_dim,
54
+ text_latent_dim=text_latent_dim,
55
+ num_heads=num_heads,
56
+ dropout=dropout,
57
+ time_embed_dim=time_embed_dim
58
+ ),
59
+ ffn_cfg=dict(
60
+ latent_dim=latent_dim,
61
+ ffn_dim=ff_size,
62
+ dropout=dropout,
63
+ time_embed_dim=time_embed_dim
64
+ ),
65
+ text_encoder=dict(
66
+ pretrained_model='clip',
67
+ latent_dim=text_latent_dim,
68
+ num_layers=4,
69
+ num_heads=4,
70
+ ff_size=2048,
71
+ dropout=dropout,
72
+ use_text_proj=True
73
+ )
74
+ ),
75
+ loss_recon=dict(type='MSELoss', loss_weight=1, reduction='none'),
76
+ diffusion_train=dict(
77
+ beta_scheduler='linear',
78
+ diffusion_steps=1000,
79
+ model_mean_type='epsilon',
80
+ model_var_type='fixed_small',
81
+ ),
82
+ diffusion_test=dict(
83
+ beta_scheduler='linear',
84
+ diffusion_steps=1000,
85
+ model_mean_type='epsilon',
86
+ model_var_type='fixed_small',
87
+ ),
88
+ inference_type='ddpm'
89
+ )
90
+ data = dict(samples_per_gpu=128)
configs/remodiffuse/remodiffuse_kit.py ADDED
@@ -0,0 +1,141 @@
1
+ _base_ = ['../_base_/datasets/kit_ml_bs128.py']
2
+
3
+ # checkpoint saving
4
+ checkpoint_config = dict(interval=1)
5
+
6
+ dist_params = dict(backend='nccl')
7
+ log_level = 'INFO'
8
+ load_from = None
9
+ resume_from = None
10
+ workflow = [('train', 1)]
11
+
12
+ # optimizer
13
+ optimizer = dict(type='Adam', lr=2e-4)
14
+ optimizer_config = dict(grad_clip=None)
15
+ # learning policy
16
+ lr_config = dict(policy='CosineAnnealing', min_lr_ratio=2e-5, by_epoch=False)
17
+ runner = dict(type='EpochBasedRunner', max_epochs=20)
18
+
19
+ log_config = dict(
20
+ interval=50,
21
+ hooks=[
22
+ dict(type='TextLoggerHook'),
23
+ # dict(type='TensorboardLoggerHook')
24
+ ])
25
+
26
+ input_feats = 251
27
+ max_seq_len = 196
28
+ latent_dim = 512
29
+ time_embed_dim = 2048
30
+ text_latent_dim = 256
31
+ ff_size = 1024
32
+ num_heads = 8
33
+ dropout = 0
34
+
35
+ def scale_func(timestep):
36
+ import random
37
+ w = (1 - (1000 - timestep) / 1000) * 4.0 + 1
38
+ if timestep > 100:
39
+ if random.randint(0, 1) == 0:
40
+ output = {
41
+ 'both_coef': w,
42
+ 'text_coef': 0,
43
+ 'retr_coef': 1 - w,
44
+ 'none_coef': 0
45
+ }
46
+ else:
47
+ output = {
48
+ 'both_coef': 0,
49
+ 'text_coef': w,
50
+ 'retr_coef': 0,
51
+ 'none_coef': 1 - w
52
+ }
53
+ else:
54
+ both_coef = 0.78123
55
+ text_coef = 0.39284
56
+ retr_coef = -0.12475
57
+ none_coef = 1 - both_coef - text_coef - retr_coef
58
+ output = {
59
+ 'both_coef': both_coef,
60
+ 'text_coef': text_coef,
61
+ 'retr_coef': retr_coef,
62
+ 'none_coef': none_coef
63
+ }
64
+ return output
65
+
66
+ # model settings
67
+ model = dict(
68
+ type='MotionDiffusion',
69
+ model=dict(
70
+ type='ReMoDiffuseTransformer',
71
+ input_feats=input_feats,
72
+ max_seq_len=max_seq_len,
73
+ latent_dim=latent_dim,
74
+ time_embed_dim=time_embed_dim,
75
+ num_layers=4,
76
+ ca_block_cfg=dict(
77
+ type='SemanticsModulatedAttention',
78
+ latent_dim=latent_dim,
79
+ text_latent_dim=text_latent_dim,
80
+ num_heads=num_heads,
81
+ dropout=dropout,
82
+ time_embed_dim=time_embed_dim
83
+ ),
84
+ ffn_cfg=dict(
85
+ latent_dim=latent_dim,
86
+ ffn_dim=ff_size,
87
+ dropout=dropout,
88
+ time_embed_dim=time_embed_dim
89
+ ),
90
+ text_encoder=dict(
91
+ pretrained_model='clip',
92
+ latent_dim=text_latent_dim,
93
+ num_layers=2,
94
+ ff_size=2048,
95
+ dropout=dropout,
96
+ use_text_proj=False
97
+ ),
98
+ retrieval_cfg=dict(
99
+ num_retrieval=2,
100
+ stride=4,
101
+ num_layers=2,
102
+ num_motion_layers=2,
103
+ kinematic_coef=0.1,
104
+ topk=2,
105
+ retrieval_file='data/database/kit_text_train.npz',
106
+ latent_dim=latent_dim,
107
+ output_dim=latent_dim,
108
+ max_seq_len=max_seq_len,
109
+ num_heads=num_heads,
110
+ ff_size=ff_size,
111
+ dropout=dropout,
112
+ ffn_cfg=dict(
113
+ latent_dim=latent_dim,
114
+ ffn_dim=ff_size,
115
+ dropout=dropout,
116
+ ),
117
+ sa_block_cfg=dict(
118
+ type='EfficientSelfAttention',
119
+ latent_dim=latent_dim,
120
+ num_heads=num_heads,
121
+ dropout=dropout
122
+ ),
123
+ ),
124
+ scale_func=scale_func
125
+ ),
126
+ loss_recon=dict(type='MSELoss', loss_weight=1, reduction='none'),
127
+ diffusion_train=dict(
128
+ beta_scheduler='linear',
129
+ diffusion_steps=1000,
130
+ model_mean_type='start_x',
131
+ model_var_type='fixed_large',
132
+ ),
133
+ diffusion_test=dict(
134
+ beta_scheduler='linear',
135
+ diffusion_steps=1000,
136
+ model_mean_type='start_x',
137
+ model_var_type='fixed_large',
138
+ respace='15,15,8,6,6',
139
+ ),
140
+ inference_type='ddim'
141
+ )
configs/remodiffuse/remodiffuse_t2m.py ADDED
@@ -0,0 +1,141 @@
1
+ _base_ = ['../_base_/datasets/human_ml3d_bs128.py']
2
+
3
+ # checkpoint saving
4
+ checkpoint_config = dict(interval=1)
5
+
6
+ dist_params = dict(backend='nccl')
7
+ log_level = 'INFO'
8
+ load_from = None
9
+ resume_from = None
10
+ workflow = [('train', 1)]
11
+
12
+ # optimizer
13
+ optimizer = dict(type='Adam', lr=2e-4)
14
+ optimizer_config = dict(grad_clip=None)
15
+ # learning policy
16
+ lr_config = dict(policy='CosineAnnealing', min_lr_ratio=2e-5, by_epoch=False)
17
+ runner = dict(type='EpochBasedRunner', max_epochs=40)
18
+
19
+ log_config = dict(
20
+ interval=50,
21
+ hooks=[
22
+ dict(type='TextLoggerHook'),
23
+ # dict(type='TensorboardLoggerHook')
24
+ ])
25
+
26
+ input_feats = 263
27
+ max_seq_len = 196
28
+ latent_dim = 512
29
+ time_embed_dim = 2048
30
+ text_latent_dim = 256
31
+ ff_size = 1024
32
+ num_heads = 8
33
+ dropout = 0
34
+
35
+ def scale_func(timestep):
36
+ import random
37
+ w = (1 - (1000 - timestep) / 1000) * 6.5 + 1
38
+ if timestep > 100:
39
+ if random.randint(0, 1) == 0:
40
+ output = {
41
+ 'both_coef': w,
42
+ 'text_coef': 0,
43
+ 'retr_coef': 1 - w,
44
+ 'none_coef': 0
45
+ }
46
+ else:
47
+ output = {
48
+ 'both_coef': 0,
49
+ 'text_coef': w,
50
+ 'retr_coef': 0,
51
+ 'none_coef': 1 - w
52
+ }
53
+ else:
54
+ both_coef = 0.52351
55
+ text_coef = -0.28419
56
+ retr_coef = 2.39872
57
+ none_coef = 1 - both_coef - text_coef - retr_coef
58
+ output = {
59
+ 'both_coef': both_coef,
60
+ 'text_coef': text_coef,
61
+ 'retr_coef': retr_coef,
62
+ 'none_coef': none_coef
63
+ }
64
+ return output
65
+
66
+ # model settings
67
+ model = dict(
68
+ type='MotionDiffusion',
69
+ model=dict(
70
+ type='ReMoDiffuseTransformer',
71
+ input_feats=input_feats,
72
+ max_seq_len=max_seq_len,
73
+ latent_dim=latent_dim,
74
+ time_embed_dim=time_embed_dim,
75
+ num_layers=4,
76
+ ca_block_cfg=dict(
77
+ type='SemanticsModulatedAttention',
78
+ latent_dim=latent_dim,
79
+ text_latent_dim=text_latent_dim,
80
+ num_heads=num_heads,
81
+ dropout=dropout,
82
+ time_embed_dim=time_embed_dim
83
+ ),
84
+ ffn_cfg=dict(
85
+ latent_dim=latent_dim,
86
+ ffn_dim=ff_size,
87
+ dropout=dropout,
88
+ time_embed_dim=time_embed_dim
89
+ ),
90
+ text_encoder=dict(
91
+ pretrained_model='clip',
92
+ latent_dim=text_latent_dim,
93
+ num_layers=2,
94
+ ff_size=2048,
95
+ dropout=dropout,
96
+ use_text_proj=False
97
+ ),
98
+ retrieval_cfg=dict(
99
+ num_retrieval=2,
100
+ stride=4,
101
+ num_layers=2,
102
+ num_motion_layers=2,
103
+ kinematic_coef=0.1,
104
+ topk=2,
105
+ retrieval_file='data/database/t2m_text_train.npz',
106
+ latent_dim=latent_dim,
107
+ output_dim=latent_dim,
108
+ max_seq_len=max_seq_len,
109
+ num_heads=num_heads,
110
+ ff_size=ff_size,
111
+ dropout=dropout,
112
+ ffn_cfg=dict(
113
+ latent_dim=latent_dim,
114
+ ffn_dim=ff_size,
115
+ dropout=dropout,
116
+ ),
117
+ sa_block_cfg=dict(
118
+ type='EfficientSelfAttention',
119
+ latent_dim=latent_dim,
120
+ num_heads=num_heads,
121
+ dropout=dropout
122
+ ),
123
+ ),
124
+ scale_func=scale_func
125
+ ),
126
+ loss_recon=dict(type='MSELoss', loss_weight=1, reduction='none'),
127
+ diffusion_train=dict(
128
+ beta_scheduler='linear',
129
+ diffusion_steps=1000,
130
+ model_mean_type='start_x',
131
+ model_var_type='fixed_large',
132
+ ),
133
+ diffusion_test=dict(
134
+ beta_scheduler='linear',
135
+ diffusion_steps=1000,
136
+ model_mean_type='start_x',
137
+ model_var_type='fixed_large',
138
+ respace='15,15,8,6,6',
139
+ ),
140
+ inference_type='ddim'
141
+ )
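
Note: scale_func defines ReMoDiffuse's mixing of four denoising branches (text+retrieval, text only, retrieval only, unconditioned). For high-noise timesteps (t > 100) it randomly alternates between two guidance pairs, and below that it switches to fixed coefficients; in every branch the four coefficients sum to 1. A quick check, assuming scale_func from this config is in scope:

    for t in (900, 500, 99):
        coefs = scale_func(t)
        assert abs(sum(coefs.values()) - 1.0) < 1e-6
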
data/database/t2m_text_train.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae3575b686e29623f9e1715345b052726650f53c5bfcc770d9fb87a827a60249
+ size 1462801786
data/datasets/human_ml3d/mean.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d73483a5b53e017b4044fe363164d7c185082a02ae7f69525ea70c5ccfd4a85
+ size 1180
data/datasets/human_ml3d/std.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a6d720e004b6da18e8033d739de6078cbc7c1c8fad0ff62eee86f173e4430a2
+ size 1180
data/datasets/kit_ml/mean.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e23fac51db2215ab5666324226be48f27efd6a6e7b22ebd17c28e0f056a7c22
+ size 2136
data/datasets/kit_ml/std.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:296a60656cea07e65ee64512d73d47c0412df0698b35194116330661be32fa90
+ size 2136
logs/mdm/mdm_t2m/latest.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8810255fb8df9eed6211537de9826f07ff73862f367cbf91532d84fd4c9a497e
+ size 81791550
logs/motiondiffuse/motiondiffuse_t2m/latest.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:521baa6ba60865710bc75b99f393b133e45dc18083229a2258a16e5dc65f904a
+ size 348728194
logs/remodiffuse/remodiffuse_t2m/latest.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aaa34b3328942769478e96283678424c95c4b817ca6f7162c4cf1fc512d4951b
+ size 187939375
mogen/__init__.py ADDED
@@ -0,0 +1,56 @@
1
+ import warnings
2
+
3
+ import mmcv
4
+ from packaging.version import parse
5
+
6
+ from .version import __version__
7
+
8
+
9
+ def digit_version(version_str: str, length: int = 4):
10
+ """Convert a version string into a tuple of integers.
11
+ This method is usually used for comparing two versions. For pre-release
12
+ versions: alpha < beta < rc.
13
+ Args:
14
+ version_str (str): The version string.
15
+ length (int): The maximum number of version levels. Default: 4.
16
+ Returns:
17
+ tuple[int]: The version info in digits (integers).
18
+ """
19
+ version = parse(version_str)
20
+ assert version.release, f'failed to parse version {version_str}'
21
+ release = list(version.release)
22
+ release = release[:length]
23
+ if len(release) < length:
24
+ release = release + [0] * (length - len(release))
25
+ if version.is_prerelease:
26
+ mapping = {'a': -3, 'b': -2, 'rc': -1}
27
+ val = -4
28
+ # version.pre can be None
29
+ if version.pre:
30
+ if version.pre[0] not in mapping:
31
+ warnings.warn(f'unknown prerelease version {version.pre[0]}, '
32
+ 'version checking may go wrong')
33
+ else:
34
+ val = mapping[version.pre[0]]
35
+ release.extend([val, version.pre[-1]])
36
+ else:
37
+ release.extend([val, 0])
38
+
39
+ elif version.is_postrelease:
40
+ release.extend([1, version.post])
41
+ else:
42
+ release.extend([0, 0])
43
+ return tuple(release)
44
+
45
+
46
+ mmcv_minimum_version = '1.4.2'
47
+ mmcv_maximum_version = '1.9.0'
48
+ mmcv_version = digit_version(mmcv.__version__)
49
+
50
+
51
+ assert (mmcv_version >= digit_version(mmcv_minimum_version)
52
+ and mmcv_version <= digit_version(mmcv_maximum_version)), \
53
+ f'MMCV=={mmcv.__version__} is used but incompatible. ' \
54
+ f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
55
+
56
+ __all__ = ['__version__', 'digit_version']
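
Note: digit_version pads the release to four fields and appends two more encoding pre-/post-release status, so tuples compare correctly across release types. Worked examples of the returned values (they follow directly from the rules above):

    assert digit_version('1.7.0') == (1, 7, 0, 0, 0, 0)
    assert digit_version('1.7.0rc1') == (1, 7, 0, 0, -1, 1)
    assert digit_version('1.7.0rc1') < digit_version('1.7.0')
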
mogen/apis/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from mogen.apis import test, train
+ from mogen.apis.test import (
+     collect_results_cpu,
+     collect_results_gpu,
+     multi_gpu_test,
+     single_gpu_test,
+ )
+ from mogen.apis.train import set_random_seed, train_model
+
+ __all__ = [
+     'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test',
+     'single_gpu_test', 'set_random_seed', 'train_model'
+ ]
mogen/apis/test.py ADDED
@@ -0,0 +1,160 @@
1
+ import os.path as osp
2
+ import pickle
3
+ import shutil
4
+ import tempfile
5
+ import time
6
+
7
+ import mmcv
8
+ import torch
9
+ import torch.distributed as dist
10
+ from mmcv.runner import get_dist_info
11
+
12
+
13
+ def single_gpu_test(model, data_loader):
14
+ """Test with single gpu."""
15
+ model.eval()
16
+ results = []
17
+ dataset = data_loader.dataset
18
+ prog_bar = mmcv.ProgressBar(len(dataset))
19
+ for i, data in enumerate(data_loader):
20
+ with torch.no_grad():
21
+ result = model(return_loss=False, **data)
22
+
23
+ batch_size = len(result)
24
+ if isinstance(result, list):
25
+ results.extend(result)
26
+ else:
27
+ results.append(result)
28
+
29
+ batch_size = data['motion'].size(0)
30
+ for _ in range(batch_size):
31
+ prog_bar.update()
32
+ return results
33
+
34
+
35
+ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
36
+ """Test model with multiple gpus.
37
+ This method tests model with multiple gpus and collects the results
38
+ under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
39
+ it encodes results to gpu tensors and use gpu communication for results
40
+ collection. On cpu mode it saves the results on different gpus to 'tmpdir'
41
+ and collects them by the rank 0 worker.
42
+ Args:
43
+ model (nn.Module): Model to be tested.
44
+ data_loader (nn.Dataloader): Pytorch data loader.
45
+ tmpdir (str): Path of directory to save the temporary results from
46
+ different gpus under cpu mode.
47
+ gpu_collect (bool): Option to use either gpu or cpu to collect results.
48
+ Returns:
49
+ list: The prediction results.
50
+ """
51
+ model.eval()
52
+ results = []
53
+ dataset = data_loader.dataset
54
+ rank, world_size = get_dist_info()
55
+ if rank == 0:
56
+ # Check if tmpdir is valid for cpu_collect
57
+ if (not gpu_collect) and (tmpdir is not None and osp.exists(tmpdir)):
58
+ raise OSError((f'The tmpdir {tmpdir} already exists.',
59
+ ' Since tmpdir will be deleted after testing,',
60
+ ' please make sure you specify an empty one.'))
61
+ prog_bar = mmcv.ProgressBar(len(dataset))
62
+ time.sleep(2) # This line can prevent deadlock problem in some cases.
63
+ for i, data in enumerate(data_loader):
64
+ with torch.no_grad():
65
+ result = model(return_loss=False, **data)
66
+ if isinstance(result, list):
67
+ results.extend(result)
68
+ else:
69
+ results.append(result)
70
+
71
+ if rank == 0:
72
+ batch_size = data['motion'].size(0)
73
+ for _ in range(batch_size * world_size):
74
+ prog_bar.update()
75
+
76
+ # collect results from all ranks
77
+ if gpu_collect:
78
+ results = collect_results_gpu(results, len(dataset))
79
+ else:
80
+ results = collect_results_cpu(results, len(dataset), tmpdir)
81
+ return results
82
+
83
+
84
+ def collect_results_cpu(result_part, size, tmpdir=None):
85
+ """Collect results in cpu."""
86
+ rank, world_size = get_dist_info()
87
+ # create a tmp dir if it is not specified
88
+ if tmpdir is None:
89
+ MAX_LEN = 512
90
+ # 32 is whitespace
91
+ dir_tensor = torch.full((MAX_LEN, ),
92
+ 32,
93
+ dtype=torch.uint8,
94
+ device='cuda')
95
+ if rank == 0:
96
+ mmcv.mkdir_or_exist('.dist_test')
97
+ tmpdir = tempfile.mkdtemp(dir='.dist_test')
98
+ tmpdir = torch.tensor(
99
+ bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
100
+ dir_tensor[:len(tmpdir)] = tmpdir
101
+ dist.broadcast(dir_tensor, 0)
102
+ tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
103
+ else:
104
+ mmcv.mkdir_or_exist(tmpdir)
105
+ # dump the part result to the dir
106
+ mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
107
+ dist.barrier()
108
+ # collect all parts
109
+ if rank != 0:
110
+ return None
111
+ else:
112
+ # load results of all parts from tmp dir
113
+ part_list = []
114
+ for i in range(world_size):
115
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
116
+ part_result = mmcv.load(part_file)
117
+ part_list.append(part_result)
118
+ # sort the results
119
+ ordered_results = []
120
+ for res in zip(*part_list):
121
+ ordered_results.extend(list(res))
122
+ # the dataloader may pad some samples
123
+ ordered_results = ordered_results[:size]
124
+ # remove tmp dir
125
+ shutil.rmtree(tmpdir)
126
+ return ordered_results
127
+
128
+
129
+ def collect_results_gpu(result_part, size):
130
+ """Collect results in gpu."""
131
+ rank, world_size = get_dist_info()
132
+ # dump result part to tensor with pickle
133
+ part_tensor = torch.tensor(
134
+ bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
135
+ # gather all result part tensor shape
136
+ shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
137
+ shape_list = [shape_tensor.clone() for _ in range(world_size)]
138
+ dist.all_gather(shape_list, shape_tensor)
139
+ # padding result part tensor to max length
140
+ shape_max = torch.tensor(shape_list).max()
141
+ part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
142
+ part_send[:shape_tensor[0]] = part_tensor
143
+ part_recv_list = [
144
+ part_tensor.new_zeros(shape_max) for _ in range(world_size)
145
+ ]
146
+ # gather all result part
147
+ dist.all_gather(part_recv_list, part_send)
148
+
149
+ if rank == 0:
150
+ part_list = []
151
+ for recv, shape in zip(part_recv_list, shape_list):
152
+ part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())
153
+ part_list.append(part_result)
154
+ # sort the results
155
+ ordered_results = []
156
+ for res in zip(*part_list):
157
+ ordered_results.extend(list(res))
158
+ # the dataloader may pad some samples
159
+ ordered_results = ordered_results[:size]
160
+ return ordered_results
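
Note: single_gpu_test and multi_gpu_test only collect per-sample model outputs; the metrics themselves are computed later by dataset.evaluate (see eval_hooks.py below). A hedged sketch of the single-GPU path, assuming a built model and a loaded cfg as in app.py:

    from mmcv.parallel import MMDataParallel
    from mogen.apis import single_gpu_test
    from mogen.datasets import build_dataloader, build_dataset

    test_dataset = build_dataset(cfg.data.test)
    test_loader = build_dataloader(test_dataset, samples_per_gpu=32, workers_per_gpu=1,
                                   dist=False, shuffle=False, round_up=False)
    results = single_gpu_test(MMDataParallel(model, device_ids=[0]), test_loader)
    metrics = test_dataset.evaluate(results, work_dir='eval_tmp')
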
mogen/apis/train.py ADDED
@@ -0,0 +1,165 @@
1
+ import random
2
+ import warnings
3
+
4
+ import numpy as np
5
+ import torch
6
+ from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
7
+ from mmcv.runner import (
8
+ DistSamplerSeedHook,
9
+ Fp16OptimizerHook,
10
+ OptimizerHook,
11
+ build_runner,
12
+ )
13
+
14
+ from mogen.core.distributed_wrapper import DistributedDataParallelWrapper
15
+ from mogen.core.evaluation import DistEvalHook, EvalHook
16
+ from mogen.core.optimizer import build_optimizers
17
+ from mogen.datasets import build_dataloader, build_dataset
18
+ from mogen.utils import get_root_logger
19
+
20
+
21
+ def set_random_seed(seed, deterministic=False):
22
+ """Set random seed.
23
+ Args:
24
+ seed (int): Seed to be used.
25
+ deterministic (bool): Whether to set the deterministic option for
26
+ CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
27
+ to True and `torch.backends.cudnn.benchmark` to False.
28
+ Default: False.
29
+ """
30
+ random.seed(seed)
31
+ np.random.seed(seed)
32
+ torch.manual_seed(seed)
33
+ torch.cuda.manual_seed_all(seed)
34
+ if deterministic:
35
+ torch.backends.cudnn.deterministic = True
36
+ torch.backends.cudnn.benchmark = False
37
+
38
+
39
+ def train_model(model,
40
+ dataset,
41
+ cfg,
42
+ distributed=False,
43
+ validate=False,
44
+ timestamp=None,
45
+ device='cuda',
46
+ meta=None):
47
+ """Main api for training model."""
48
+ logger = get_root_logger(cfg.log_level)
49
+
50
+ # prepare data loaders
51
+ dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
52
+
53
+ data_loaders = [
54
+ build_dataloader(
55
+ ds,
56
+ cfg.data.samples_per_gpu,
57
+ cfg.data.workers_per_gpu,
58
+ # cfg.gpus will be ignored if distributed
59
+ num_gpus=len(cfg.gpu_ids),
60
+ dist=distributed,
61
+ round_up=True,
62
+ seed=cfg.seed) for ds in dataset
63
+ ]
64
+
65
+ # determine whether to use the adversarial training process or not
66
+ use_adverserial_train = cfg.get('use_adversarial_train', False)
67
+
68
+ # put model on gpus
69
+ if distributed:
70
+ find_unused_parameters = cfg.get('find_unused_parameters', True)
71
+ # Sets the `find_unused_parameters` parameter in
72
+ # torch.nn.parallel.DistributedDataParallel
73
+ if use_adverserial_train:
74
+ # Use DistributedDataParallelWrapper for adversarial training
75
+ model = DistributedDataParallelWrapper(
76
+ model,
77
+ device_ids=[torch.cuda.current_device()],
78
+ broadcast_buffers=False,
79
+ find_unused_parameters=find_unused_parameters)
80
+ else:
81
+ model = MMDistributedDataParallel(
82
+ model.cuda(),
83
+ device_ids=[torch.cuda.current_device()],
84
+ broadcast_buffers=False,
85
+ find_unused_parameters=find_unused_parameters)
86
+ else:
87
+ if device == 'cuda':
88
+ model = MMDataParallel(
89
+ model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
90
+ elif device == 'cpu':
91
+ model = model.cpu()
92
+ else:
93
+ raise ValueError(F'unsupported device name {device}.')
94
+
95
+ # build runner
96
+ optimizer = build_optimizers(model, cfg.optimizer)
97
+
98
+ if cfg.get('runner') is None:
99
+ cfg.runner = {
100
+ 'type': 'EpochBasedRunner',
101
+ 'max_epochs': cfg.total_epochs
102
+ }
103
+ warnings.warn(
104
+ 'config is now expected to have a `runner` section, '
105
+ 'please set `runner` in your config.', UserWarning)
106
+
107
+ runner = build_runner(
108
+ cfg.runner,
109
+ default_args=dict(
110
+ model=model,
111
+ batch_processor=None,
112
+ optimizer=optimizer,
113
+ work_dir=cfg.work_dir,
114
+ logger=logger,
115
+ meta=meta))
116
+
117
+ # an ugly workaround to make the .log and .log.json filenames the same
118
+ runner.timestamp = timestamp
119
+
120
+ if use_adverserial_train:
121
+ # The optimizer step process is included in the train_step function
122
+ # of the model, so the runner should NOT include optimizer hook.
123
+ optimizer_config = None
124
+ else:
125
+ # fp16 setting
126
+ fp16_cfg = cfg.get('fp16', None)
127
+ if fp16_cfg is not None:
128
+ optimizer_config = Fp16OptimizerHook(
129
+ **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
130
+ elif distributed and 'type' not in cfg.optimizer_config:
131
+ optimizer_config = OptimizerHook(**cfg.optimizer_config)
132
+ else:
133
+ optimizer_config = cfg.optimizer_config
134
+
135
+ # register hooks
136
+ runner.register_training_hooks(
137
+ cfg.lr_config,
138
+ optimizer_config,
139
+ cfg.checkpoint_config,
140
+ cfg.log_config,
141
+ cfg.get('momentum_config', None),
142
+ custom_hooks_config=cfg.get('custom_hooks', None))
143
+ if distributed:
144
+ runner.register_hook(DistSamplerSeedHook())
145
+
146
+ # register eval hooks
147
+ if validate:
148
+ val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
149
+ val_dataloader = build_dataloader(
150
+ val_dataset,
151
+ samples_per_gpu=cfg.data.samples_per_gpu,
152
+ workers_per_gpu=cfg.data.workers_per_gpu,
153
+ dist=distributed,
154
+ shuffle=False,
155
+ round_up=True)
156
+ eval_cfg = cfg.get('evaluation', {})
157
+ eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
158
+ eval_hook = DistEvalHook if distributed else EvalHook
159
+ runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
160
+
161
+ if cfg.resume_from:
162
+ runner.resume(cfg.resume_from)
163
+ elif cfg.load_from:
164
+ runner.load_checkpoint(cfg.load_from)
165
+ runner.run(data_loaders, cfg.workflow)
mogen/core/__init__.py ADDED
File without changes
mogen/core/distributed_wrapper.py ADDED
@@ -0,0 +1,136 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ import torch.nn as nn
4
+ from mmcv.parallel import MODULE_WRAPPERS, MMDistributedDataParallel
5
+ from mmcv.parallel.scatter_gather import scatter_kwargs
6
+ from torch.cuda._utils import _get_device_index
7
+
8
+
9
+ @MODULE_WRAPPERS.register_module()
10
+ class DistributedDataParallelWrapper(nn.Module):
11
+ """A DistributedDataParallel wrapper for models in 3D mesh estimation task.
12
+
13
+ In 3D mesh estimation task, there is a need to wrap different modules in
14
+ the models with separate DistributedDataParallel. Otherwise, it will cause
15
+ errors for GAN training.
16
+ More specific, the GAN model, usually has two sub-modules:
17
+ generator and discriminator. If we wrap both of them in one
18
+ standard DistributedDataParallel, it will cause errors during training,
19
+ because when we update the parameters of the generator (or discriminator),
20
+ the parameters of the discriminator (or generator) is not updated, which is
21
+ not allowed for DistributedDataParallel.
22
+ So we design this wrapper to separately wrap DistributedDataParallel
23
+ for generator and discriminator.
24
+ In this wrapper, we perform two operations:
25
+ 1. Wrap the modules in the models with separate MMDistributedDataParallel.
26
+ Note that only modules with parameters will be wrapped.
27
+ 2. Do scatter operation for 'forward', 'train_step' and 'val_step'.
28
+ Note that the arguments of this wrapper is the same as those in
29
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
30
+ Args:
31
+ module (nn.Module): Module that needs to be wrapped.
32
+ device_ids (list[int | `torch.device`]): Same as that in
33
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
34
+ dim (int, optional): Same as that in the official scatter function in
35
+ pytorch. Defaults to 0.
36
+ broadcast_buffers (bool): Same as that in
37
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
38
+ Defaults to False.
39
+ find_unused_parameters (bool, optional): Same as that in
40
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
41
+ Traverse the autograd graph of all tensors contained in returned
42
+ value of the wrapped module’s forward function. Defaults to False.
43
+ kwargs (dict): Other arguments used in
44
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
45
+ """
46
+
47
+ def __init__(self,
48
+ module,
49
+ device_ids,
50
+ dim=0,
51
+ broadcast_buffers=False,
52
+ find_unused_parameters=False,
53
+ **kwargs):
54
+ super().__init__()
55
+ assert len(device_ids) == 1, (
56
+ 'Currently, DistributedDataParallelWrapper only supports one'
57
+ 'single CUDA device for each process.'
58
+ f'The length of device_ids must be 1, but got {len(device_ids)}.')
59
+ self.module = module
60
+ self.dim = dim
61
+ self.to_ddp(
62
+ device_ids=device_ids,
63
+ dim=dim,
64
+ broadcast_buffers=broadcast_buffers,
65
+ find_unused_parameters=find_unused_parameters,
66
+ **kwargs)
67
+ self.output_device = _get_device_index(device_ids[0], True)
68
+
69
+ def to_ddp(self, device_ids, dim, broadcast_buffers,
70
+ find_unused_parameters, **kwargs):
71
+ """Wrap models with separate MMDistributedDataParallel.
72
+
73
+ It only wraps the modules with parameters.
74
+ """
75
+ for name, module in self.module._modules.items():
76
+ if next(module.parameters(), None) is None:
77
+ module = module.cuda()
78
+ elif all(not p.requires_grad for p in module.parameters()):
79
+ module = module.cuda()
80
+ else:
81
+ module = MMDistributedDataParallel(
82
+ module.cuda(),
83
+ device_ids=device_ids,
84
+ dim=dim,
85
+ broadcast_buffers=broadcast_buffers,
86
+ find_unused_parameters=find_unused_parameters,
87
+ **kwargs)
88
+ self.module._modules[name] = module
89
+
90
+ def scatter(self, inputs, kwargs, device_ids):
91
+ """Scatter function.
92
+
93
+ Args:
94
+ inputs (Tensor): Input Tensor.
95
+ kwargs (dict): Args for
96
+ ``mmcv.parallel.scatter_gather.scatter_kwargs``.
97
+ device_ids (int): Device id.
98
+ """
99
+ return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
100
+
101
+ def forward(self, *inputs, **kwargs):
102
+ """Forward function.
103
+
104
+ Args:
105
+ inputs (tuple): Input data.
106
+ kwargs (dict): Args for
107
+ ``mmcv.parallel.scatter_gather.scatter_kwargs``.
108
+ """
109
+ inputs, kwargs = self.scatter(inputs, kwargs,
110
+ [torch.cuda.current_device()])
111
+ return self.module(*inputs[0], **kwargs[0])
112
+
113
+ def train_step(self, *inputs, **kwargs):
114
+ """Train step function.
115
+
116
+ Args:
117
+ inputs (Tensor): Input Tensor.
118
+ kwargs (dict): Args for
119
+ ``mmcv.parallel.scatter_gather.scatter_kwargs``.
120
+ """
121
+ inputs, kwargs = self.scatter(inputs, kwargs,
122
+ [torch.cuda.current_device()])
123
+ output = self.module.train_step(*inputs[0], **kwargs[0])
124
+ return output
125
+
126
+ def val_step(self, *inputs, **kwargs):
127
+ """Validation step function.
128
+
129
+ Args:
130
+ inputs (tuple): Input data.
131
+ kwargs (dict): Args for ``scatter_kwargs``.
132
+ """
133
+ inputs, kwargs = self.scatter(inputs, kwargs,
134
+ [torch.cuda.current_device()])
135
+ output = self.module.val_step(*inputs[0], **kwargs[0])
136
+ return output
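
Note: this wrapper exists for the adversarial-training path, where generator and discriminator must sit in separate MMDistributedDataParallel instances. It is instantiated in mogen/apis/train.py when use_adversarial_train is set; the call there looks like:

    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)
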
mogen/core/evaluation/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from mogen.core.evaluation.eval_hooks import DistEvalHook, EvalHook
+ from mogen.core.evaluation.builder import build_evaluator
+
+ __all__ = ["DistEvalHook", "EvalHook", "build_evaluator"]
mogen/core/evaluation/builder.py ADDED
@@ -0,0 +1,29 @@
+ import copy
+ import numpy as np
+ from mmcv.utils import Registry
+ from .evaluators.precision_evaluator import PrecisionEvaluator
+ from .evaluators.matching_score_evaluator import MatchingScoreEvaluator
+ from .evaluators.fid_evaluator import FIDEvaluator
+ from .evaluators.diversity_evaluator import DiversityEvaluator
+ from .evaluators.multimodality_evaluator import MultiModalityEvaluator
+
+ EVALUATORS = Registry('evaluators')
+
+ EVALUATORS.register_module(name='R Precision', module=PrecisionEvaluator)
+ EVALUATORS.register_module(name='Matching Score', module=MatchingScoreEvaluator)
+ EVALUATORS.register_module(name='FID', module=FIDEvaluator)
+ EVALUATORS.register_module(name='Diversity', module=DiversityEvaluator)
+ EVALUATORS.register_module(name='MultiModality', module=MultiModalityEvaluator)
+
+
+ def build_evaluator(metric, eval_cfg, data_len, eval_indexes):
+     cfg = copy.deepcopy(eval_cfg)
+     cfg.update(metric)
+     cfg.pop('metrics')
+     cfg['data_len'] = data_len
+     cfg['eval_indexes'] = eval_indexes
+     evaluator = EVALUATORS.build(cfg)
+     if evaluator.append_indexes is not None:
+         for i in range(eval_cfg['replication_times']):
+             eval_indexes[i] = np.concatenate((eval_indexes[i], evaluator.append_indexes[i]), axis=0)
+     return evaluator, eval_indexes
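
Note: build_evaluator consumes one entry of the metrics list defined in the dataset configs above, merged with the rest of eval_cfg. A minimal sketch of the calling pattern (data_len and eval_indexes stand in for what the dataset prepares and are not defined here):

    evaluators = []
    for metric in eval_cfg['metrics']:
        evaluator, eval_indexes = build_evaluator(metric, eval_cfg, data_len, eval_indexes)
        evaluators.append(evaluator)
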
mogen/core/evaluation/eval_hooks.py ADDED
@@ -0,0 +1,138 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import tempfile
3
+ import warnings
4
+
5
+ from mmcv.runner import DistEvalHook as BaseDistEvalHook
6
+ from mmcv.runner import EvalHook as BaseEvalHook
7
+
8
+ mogen_GREATER_KEYS = []
9
+ mogen_LESS_KEYS = []
10
+
11
+
12
+ class EvalHook(BaseEvalHook):
13
+
14
+ def __init__(self,
15
+ dataloader,
16
+ start=None,
17
+ interval=1,
18
+ by_epoch=True,
19
+ save_best=None,
20
+ rule=None,
21
+ test_fn=None,
22
+ greater_keys=mogen_GREATER_KEYS,
23
+ less_keys=mogen_LESS_KEYS,
24
+ **eval_kwargs):
25
+ if test_fn is None:
26
+ from mogen.apis import single_gpu_test
27
+ test_fn = single_gpu_test
28
+
29
+ # remove "gpu_collect" from eval_kwargs
30
+ if 'gpu_collect' in eval_kwargs:
31
+ warnings.warn(
32
+ '"gpu_collect" will be deprecated in EvalHook.'
33
+ 'Please remove it from the config.', DeprecationWarning)
34
+ _ = eval_kwargs.pop('gpu_collect')
35
+
36
+ # update "save_best" according to "key_indicator" and remove the
37
+ # latter from eval_kwargs
38
+ if 'key_indicator' in eval_kwargs or isinstance(save_best, bool):
39
+ warnings.warn(
40
+ '"key_indicator" will be deprecated in EvalHook.'
41
+ 'Please use "save_best" to specify the metric key,'
42
+ 'e.g., save_best="pa-mpjpe".', DeprecationWarning)
43
+
44
+ key_indicator = eval_kwargs.pop('key_indicator', None)
45
+ if save_best is True and key_indicator is None:
46
+ raise ValueError('key_indicator should not be None, when '
47
+ 'save_best is set to True.')
48
+ save_best = key_indicator
49
+
50
+ super().__init__(dataloader, start, interval, by_epoch, save_best,
51
+ rule, test_fn, greater_keys, less_keys, **eval_kwargs)
52
+
53
+ def evaluate(self, runner, results):
54
+
55
+ with tempfile.TemporaryDirectory() as tmp_dir:
56
+ eval_res = self.dataloader.dataset.evaluate(
57
+ results,
58
+ work_dir=tmp_dir,
59
+ logger=runner.logger,
60
+ **self.eval_kwargs)
61
+
62
+ for name, val in eval_res.items():
63
+ runner.log_buffer.output[name] = val
64
+ runner.log_buffer.ready = True
65
+
66
+ if self.save_best is not None:
67
+ if self.key_indicator == 'auto':
68
+ self._init_rule(self.rule, list(eval_res.keys())[0])
69
+
70
+ return eval_res[self.key_indicator]
71
+
72
+ return None
73
+
74
+
75
+ class DistEvalHook(BaseDistEvalHook):
76
+
77
+ def __init__(self,
78
+ dataloader,
79
+ start=None,
80
+ interval=1,
81
+ by_epoch=True,
82
+ save_best=None,
83
+ rule=None,
84
+ test_fn=None,
85
+ greater_keys=mogen_GREATER_KEYS,
86
+ less_keys=mogen_LESS_KEYS,
87
+ broadcast_bn_buffer=True,
88
+ tmpdir=None,
89
+ gpu_collect=False,
90
+ **eval_kwargs):
91
+
92
+ if test_fn is None:
93
+ from mogen.apis import multi_gpu_test
94
+ test_fn = multi_gpu_test
95
+
96
+ # update "save_best" according to "key_indicator" and remove the
97
+ # latter from eval_kwargs
98
+ if 'key_indicator' in eval_kwargs or isinstance(save_best, bool):
99
+ warnings.warn(
100
+ '"key_indicator" will be deprecated in EvalHook.'
101
+ 'Please use "save_best" to specify the metric key,'
102
+ 'e.g., save_best="pa-mpjpe".', DeprecationWarning)
103
+
104
+ key_indicator = eval_kwargs.pop('key_indicator', None)
105
+ if save_best is True and key_indicator is None:
106
+ raise ValueError('key_indicator should not be None, when '
107
+ 'save_best is set to True.')
108
+ save_best = key_indicator
109
+
110
+ super().__init__(dataloader, start, interval, by_epoch, save_best,
111
+ rule, test_fn, greater_keys, less_keys,
112
+ broadcast_bn_buffer, tmpdir, gpu_collect,
113
+ **eval_kwargs)
114
+
115
+ def evaluate(self, runner, results):
116
+ """Evaluate the results.
117
+ Args:
118
+ runner (:obj:`mmcv.Runner`): The underlined training runner.
119
+ results (list): Output results.
120
+ """
121
+ with tempfile.TemporaryDirectory() as tmp_dir:
122
+ eval_res = self.dataloader.dataset.evaluate(
123
+ results,
124
+ work_dir=tmp_dir,
125
+ logger=runner.logger,
126
+ **self.eval_kwargs)
127
+
128
+ for name, val in eval_res.items():
129
+ runner.log_buffer.output[name] = val
130
+ runner.log_buffer.ready = True
131
+
132
+ if self.save_best is not None:
133
+ if self.key_indicator == 'auto':
134
+ # infer from eval_results
135
+ self._init_rule(self.rule, list(eval_res.keys())[0])
136
+ return eval_res[self.key_indicator]
137
+
138
+ return None
mogen/core/evaluation/evaluators/__init__.py ADDED
File without changes
mogen/core/evaluation/evaluators/base_evaluator.py ADDED
@@ -0,0 +1,144 @@
1
+ import torch
2
+ import numpy as np
3
+ from ..utils import get_metric_statistics
4
+
5
+
6
+ class BaseEvaluator(object):
7
+
8
+ def __init__(self,
9
+ batch_size=None,
10
+ drop_last=False,
11
+ replication_times=1,
12
+ replication_reduction='statistics',
13
+ eval_begin_idx=None,
14
+ eval_end_idx=None):
15
+ self.batch_size = batch_size
16
+ self.drop_last = drop_last
17
+ self.replication_times = replication_times
18
+ self.replication_reduction = replication_reduction
19
+ assert replication_reduction in ['statistics', 'mean', 'concat']
20
+ self.eval_begin_idx = eval_begin_idx
21
+ self.eval_end_idx = eval_end_idx
22
+
23
+ def evaluate(self, results):
24
+ total_len = len(results)
25
+ partial_len = total_len // self.replication_times
26
+ all_metrics = []
27
+ for replication_idx in range(self.replication_times):
28
+ partial_results = results[
29
+ replication_idx * partial_len: (replication_idx + 1) * partial_len]
30
+ if self.batch_size is not None:
31
+ batch_metrics = []
32
+ for batch_start in range(self.eval_begin_idx, self.eval_end_idx, self.batch_size):
33
+ batch_results = partial_results[batch_start: batch_start + self.batch_size]
34
+ if len(batch_results) < self.batch_size and self.drop_last:
35
+ continue
36
+ batch_metrics.append(self.single_evaluate(batch_results))
37
+ all_metrics.append(self.concat_batch_metrics(batch_metrics))
38
+ else:
39
+ batch_results = partial_results[self.eval_begin_idx: self.eval_end_idx]
40
+ all_metrics.append(self.single_evaluate(batch_results))
41
+ all_metrics = np.stack(all_metrics, axis=0)
42
+ if self.replication_reduction == 'statistics':
43
+ values = get_metric_statistics(all_metrics, self.replication_times)
44
+ elif self.replication_reduction == 'mean':
45
+ values = np.mean(all_metrics, axis=0)
46
+ elif self.replication_reduction == 'concat':
47
+ values = all_metrics
48
+ return self.parse_values(values)
49
+
50
+ def prepare_results(self, results):
51
+ text = []
52
+ pred_motion = []
53
+ pred_motion_length = []
54
+ pred_motion_mask = []
55
+ motion = []
56
+ motion_length = []
57
+ motion_mask = []
58
+ token = []
59
+ # count the maximum motion length
60
+ T = max([result['motion'].shape[0] for result in results])
61
+ for result in results:
62
+ cur_motion = result['motion']
63
+ if cur_motion.shape[0] < T:
64
+ padding_values = torch.zeros((T - cur_motion.shape[0], cur_motion.shape[1]))
65
+ padding_values = padding_values.type_as(pred_motion)
66
+ cur_motion = torch.cat([cur_motion, padding_values], dim=0)
67
+ motion.append(cur_motion)
68
+ cur_pred_motion = result['pred_motion']
69
+ if cur_pred_motion.shape[0] < T:
70
+ padding_values = torch.zeros((T - cur_pred_motion.shape[0], cur_pred_motion.shape[1]))
71
+ padding_values = padding_values.type_as(cur_pred_motion)
72
+ cur_pred_motion = torch.cat([cur_pred_motion, padding_values], dim=0)
73
+ pred_motion.append(cur_pred_motion)
74
+ cur_motion_mask = result['motion_mask']
75
+ if cur_motion_mask.shape[0] < T:
76
+ padding_values = torch.zeros((T - cur_motion_mask.shape[0]))
77
+ padding_values = padding_values.type_as(cur_motion_mask)
78
+ cur_motion_mask= torch.cat([cur_motion_mask, padding_values], dim=0)
79
+ motion_mask.append(cur_motion_mask)
80
+ cur_pred_motion_mask = result['pred_motion_mask']
81
+ if cur_pred_motion_mask.shape[0] < T:
82
+ padding_values = torch.zeros((T - cur_pred_motion_mask.shape[0]))
83
+ padding_values = padding_values.type_as(cur_pred_motion_mask)
84
+ cur_pred_motion_mask= torch.cat([cur_pred_motion_mask, padding_values], dim=0)
85
+ pred_motion_mask.append(cur_pred_motion_mask)
86
+ motion_length.append(result['motion_length'].item())
87
+ pred_motion_length.append(result['pred_motion_length'].item())
88
+ if 'text' in result.keys():
89
+ text.append(result['text'])
90
+ if 'token' in result.keys():
91
+ token.append(result['token'])
92
+
93
+ motion = torch.stack(motion, dim=0)
94
+ pred_motion = torch.stack(pred_motion, dim=0)
95
+ motion_mask = torch.stack(motion_mask, dim=0)
96
+ pred_motion_mask = torch.stack(pred_motion_mask, dim=0)
97
+ motion_length = torch.Tensor(motion_length).to(motion.device).long()
98
+ pred_motion_length = torch.Tensor(pred_motion_length).to(motion.device).long()
99
+ output = {
100
+ 'pred_motion': pred_motion,
101
+ 'pred_motion_mask': pred_motion_mask,
102
+ 'pred_motion_length': pred_motion_length,
103
+ 'motion': motion,
104
+ 'motion_mask': motion_mask,
105
+ 'motion_length': motion_length,
106
+ 'text': text,
107
+ 'token': token
108
+ }
109
+ return output
110
+
111
+ def to_device(self, device):
112
+ for model in self.model_list:
113
+ model.to(device)
114
+
115
+ def motion_encode(self, motion, motion_length, motion_mask, device):
116
+ N = motion.shape[0]
117
+ motion_emb = []
118
+ batch_size = 32
119
+ cur_idx = 0
120
+ with torch.no_grad():
121
+ while cur_idx < N:
122
+ cur_motion = motion[cur_idx: cur_idx + batch_size].to(device)
123
+ cur_motion_length = motion_length[cur_idx: cur_idx + batch_size].to(device)
124
+ cur_motion_mask = motion_mask[cur_idx: cur_idx + batch_size].to(device)
125
+ cur_motion_emb = self.motion_encoder(cur_motion, cur_motion_length, cur_motion_mask)
126
+ motion_emb.append(cur_motion_emb)
127
+ cur_idx += batch_size
128
+ motion_emb = torch.cat(motion_emb, dim=0)
129
+ return motion_emb
130
+
131
+ def text_encode(self, text, token, device):
132
+ N = len(text)
133
+ text_emb = []
134
+ batch_size = 32
135
+ cur_idx = 0
136
+ with torch.no_grad():
137
+ while cur_idx < N:
138
+ cur_text = text[cur_idx: cur_idx + batch_size]
139
+ cur_token = token[cur_idx: cur_idx + batch_size]
140
+ cur_text_emb = self.text_encoder(cur_text, cur_token, device)
141
+ text_emb.append(cur_text_emb)
142
+ cur_idx += batch_size
143
+ text_emb = torch.cat(text_emb, dim=0)
144
+ return text_emb
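The evaluators added below all follow the contract defined here: build any encoder models in `__init__` and collect them in `self.model_list`, implement `single_evaluate` (plus `concat_batch_metrics` when `batch_size` is set), and map the reduced values to named metrics in `parse_values`. A minimal sketch of a custom evaluator written against that contract; the metric itself is a toy, not part of this codebase:

from mogen.core.evaluation.evaluators.base_evaluator import BaseEvaluator


class MeanLengthEvaluator(BaseEvaluator):
    """Toy metric: average predicted motion length (illustrative only)."""

    def __init__(self, data_len=0, replication_times=1, **kwargs):
        super().__init__(replication_times=replication_times,
                         replication_reduction='statistics',
                         eval_begin_idx=0,
                         eval_end_idx=data_len)
        self.model_list = []  # no pretrained encoders needed for this toy metric

    def single_evaluate(self, results):
        results = self.prepare_results(results)
        return results['pred_motion_length'].float().mean().item()

    def parse_values(self, values):
        # values == (mean, conf_interval) because replication_reduction='statistics'
        return {'MeanLength (mean)': values[0], 'MeanLength (conf)': values[1]}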
mogen/core/evaluation/evaluators/diversity_evaluator.py ADDED
@@ -0,0 +1,52 @@
import numpy as np
import torch

from ..get_model import get_motion_model
from .base_evaluator import BaseEvaluator
from ..utils import calculate_diversity


class DiversityEvaluator(BaseEvaluator):

    def __init__(self,
                 data_len=0,
                 motion_encoder_name=None,
                 motion_encoder_path=None,
                 num_samples=300,
                 batch_size=None,
                 drop_last=False,
                 replication_times=1,
                 replication_reduction='statistics',
                 **kwargs):
        super().__init__(
            replication_times=replication_times,
            replication_reduction=replication_reduction,
            batch_size=batch_size,
            drop_last=drop_last,
            eval_begin_idx=0,
            eval_end_idx=data_len
        )
        self.num_samples = num_samples
        self.append_indexes = None
        self.motion_encoder = get_motion_model(motion_encoder_name, motion_encoder_path)
        self.model_list = [self.motion_encoder]

    def single_evaluate(self, results):
        results = self.prepare_results(results)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        motion = results['motion']
        pred_motion = results['pred_motion']
        pred_motion_length = results['pred_motion_length']
        pred_motion_mask = results['pred_motion_mask']
        self.motion_encoder.to(device)
        self.motion_encoder.eval()
        with torch.no_grad():
            pred_motion_emb = self.motion_encode(pred_motion, pred_motion_length, pred_motion_mask, device).cpu().detach().numpy()
        diversity = calculate_diversity(pred_motion_emb, self.num_samples)
        return diversity

    def parse_values(self, values):
        metrics = {}
        metrics['Diversity (mean)'] = values[0]
        metrics['Diversity (conf)'] = values[1]
        return metrics
mogen/core/evaluation/evaluators/fid_evaluator.py ADDED
@@ -0,0 +1,58 @@
import numpy as np
import torch

from ..get_model import get_motion_model
from .base_evaluator import BaseEvaluator
from ..utils import (
    calculate_activation_statistics,
    calculate_frechet_distance)


class FIDEvaluator(BaseEvaluator):

    def __init__(self,
                 data_len=0,
                 motion_encoder_name=None,
                 motion_encoder_path=None,
                 batch_size=None,
                 drop_last=False,
                 replication_times=1,
                 replication_reduction='statistics',
                 **kwargs):
        super().__init__(
            replication_times=replication_times,
            replication_reduction=replication_reduction,
            batch_size=batch_size,
            drop_last=drop_last,
            eval_begin_idx=0,
            eval_end_idx=data_len
        )
        self.append_indexes = None
        self.motion_encoder = get_motion_model(motion_encoder_name, motion_encoder_path)
        self.model_list = [self.motion_encoder]

    def single_evaluate(self, results):
        results = self.prepare_results(results)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        pred_motion = results['pred_motion']
        pred_motion_length = results['pred_motion_length']
        pred_motion_mask = results['pred_motion_mask']
        motion = results['motion']
        motion_length = results['motion_length']
        motion_mask = results['motion_mask']
        self.motion_encoder.to(device)
        self.motion_encoder.eval()
        with torch.no_grad():
            pred_motion_emb = self.motion_encode(pred_motion, pred_motion_length, pred_motion_mask, device).cpu().detach().numpy()
            gt_motion_emb = self.motion_encode(motion, motion_length, motion_mask, device).cpu().detach().numpy()
        gt_mu, gt_cov = calculate_activation_statistics(gt_motion_emb)
        pred_mu, pred_cov = calculate_activation_statistics(pred_motion_emb)
        fid = calculate_frechet_distance(gt_mu, gt_cov, pred_mu, pred_cov)
        return fid

    def parse_values(self, values):
        metrics = {}
        metrics['FID (mean)'] = values[0]
        metrics['FID (conf)'] = values[1]
        return metrics
mogen/core/evaluation/evaluators/matching_score_evaluator.py ADDED
@@ -0,0 +1,71 @@
import numpy as np
import torch

from ..get_model import get_motion_model, get_text_model
from .base_evaluator import BaseEvaluator
from ..utils import calculate_top_k, euclidean_distance_matrix


class MatchingScoreEvaluator(BaseEvaluator):

    def __init__(self,
                 data_len=0,
                 text_encoder_name=None,
                 text_encoder_path=None,
                 motion_encoder_name=None,
                 motion_encoder_path=None,
                 top_k=3,
                 batch_size=32,
                 drop_last=False,
                 replication_times=1,
                 replication_reduction='statistics',
                 **kwargs):
        super().__init__(
            replication_times=replication_times,
            replication_reduction=replication_reduction,
            batch_size=batch_size,
            drop_last=drop_last,
            eval_begin_idx=0,
            eval_end_idx=data_len
        )
        self.append_indexes = None
        self.text_encoder = get_text_model(text_encoder_name, text_encoder_path)
        self.motion_encoder = get_motion_model(motion_encoder_name, motion_encoder_path)
        self.top_k = top_k
        self.model_list = [self.text_encoder, self.motion_encoder]

    def single_evaluate(self, results):
        results = self.prepare_results(results)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        motion = results['motion']
        pred_motion = results['pred_motion']
        pred_motion_length = results['pred_motion_length']
        pred_motion_mask = results['pred_motion_mask']
        text = results['text']
        token = results['token']
        self.text_encoder.to(device)
        self.motion_encoder.to(device)
        self.text_encoder.eval()
        self.motion_encoder.eval()
        with torch.no_grad():
            word_emb = self.text_encode(text, token, device=device).cpu().detach().numpy()
            motion_emb = self.motion_encode(pred_motion, pred_motion_length, pred_motion_mask, device).cpu().detach().numpy()
        dist_mat = euclidean_distance_matrix(word_emb, motion_emb)
        matching_score = dist_mat.trace()
        all_size = word_emb.shape[0]
        return matching_score, all_size

    def concat_batch_metrics(self, batch_metrics):
        matching_score_sum = 0
        all_size = 0
        for batch_matching_score, batch_all_size in batch_metrics:
            matching_score_sum += batch_matching_score
            all_size += batch_all_size
        matching_score = matching_score_sum / all_size
        return matching_score

    def parse_values(self, values):
        metrics = {}
        metrics['Matching Score (mean)'] = values[0]
        metrics['Matching Score (conf)'] = values[1]
        return metrics
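Since the i-th text is paired with the i-th predicted motion, only the diagonal of `dist_mat` matters here: `dist_mat.trace()` sums the paired distances and `concat_batch_metrics` divides by the number of pairs. A small self-contained check of that reading, using the helper from `mogen/core/evaluation/utils.py` with random stand-in embeddings in place of the encoders:

import numpy as np
from mogen.core.evaluation.utils import euclidean_distance_matrix

text_emb = np.random.randn(4, 8)
motion_emb = text_emb + 0.01 * np.random.randn(4, 8)  # each motion close to its own text
dist_mat = euclidean_distance_matrix(text_emb, motion_emb)
matching_score = dist_mat.trace() / dist_mat.shape[0]  # mean distance over matched pairs
print(matching_score)  # small, because pair (i, i) is nearly identical here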
mogen/core/evaluation/evaluators/multimodality_evaluator.py ADDED
@@ -0,0 +1,63 @@
import numpy as np
import torch

from ..get_model import get_motion_model
from .base_evaluator import BaseEvaluator
from ..utils import calculate_multimodality


class MultiModalityEvaluator(BaseEvaluator):

    def __init__(self,
                 data_len=0,
                 motion_encoder_name=None,
                 motion_encoder_path=None,
                 num_samples=100,
                 num_repeats=30,
                 num_picks=10,
                 batch_size=None,
                 drop_last=False,
                 replication_times=1,
                 replication_reduction='statistics',
                 **kwargs):
        super().__init__(
            replication_times=replication_times,
            replication_reduction=replication_reduction,
            batch_size=batch_size,
            drop_last=drop_last,
            eval_begin_idx=data_len,
            eval_end_idx=data_len + num_samples * num_repeats
        )
        self.num_samples = num_samples
        self.num_repeats = num_repeats
        self.num_picks = num_picks
        self.append_indexes = []
        for i in range(replication_times):
            append_indexes = []
            selected_indexs = np.random.choice(data_len, self.num_samples)
            for index in selected_indexs:
                append_indexes = append_indexes + [index] * self.num_repeats
            self.append_indexes.append(np.array(append_indexes))
        self.motion_encoder = get_motion_model(motion_encoder_name, motion_encoder_path)
        self.model_list = [self.motion_encoder]

    def single_evaluate(self, results):
        results = self.prepare_results(results)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        motion = results['motion']
        pred_motion = results['pred_motion']
        pred_motion_length = results['pred_motion_length']
        pred_motion_mask = results['pred_motion_mask']
        self.motion_encoder.to(device)
        self.motion_encoder.eval()
        with torch.no_grad():
            pred_motion_emb = self.motion_encode(pred_motion, pred_motion_length, pred_motion_mask, device).cpu().detach().numpy()
        pred_motion_emb = pred_motion_emb.reshape((self.num_samples, self.num_repeats, -1))
        multimodality = calculate_multimodality(pred_motion_emb, self.num_picks)
        return multimodality

    def parse_values(self, values):
        metrics = {}
        metrics['MultiModality (mean)'] = values[0]
        metrics['MultiModality (conf)'] = values[1]
        return metrics
mogen/core/evaluation/evaluators/precision_evaluator.py ADDED
@@ -0,0 +1,74 @@
import numpy as np
import torch

from ..get_model import get_motion_model, get_text_model
from .base_evaluator import BaseEvaluator
from ..utils import calculate_top_k, euclidean_distance_matrix


class PrecisionEvaluator(BaseEvaluator):

    def __init__(self,
                 data_len=0,
                 text_encoder_name=None,
                 text_encoder_path=None,
                 motion_encoder_name=None,
                 motion_encoder_path=None,
                 top_k=3,
                 batch_size=32,
                 drop_last=False,
                 replication_times=1,
                 replication_reduction='statistics',
                 **kwargs):
        super().__init__(
            replication_times=replication_times,
            replication_reduction=replication_reduction,
            batch_size=batch_size,
            drop_last=drop_last,
            eval_begin_idx=0,
            eval_end_idx=data_len
        )
        self.append_indexes = None
        self.text_encoder = get_text_model(text_encoder_name, text_encoder_path)
        self.motion_encoder = get_motion_model(motion_encoder_name, motion_encoder_path)
        self.top_k = top_k
        self.model_list = [self.text_encoder, self.motion_encoder]

    def single_evaluate(self, results):
        results = self.prepare_results(results)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        motion = results['motion']
        pred_motion = results['pred_motion']
        pred_motion_length = results['pred_motion_length']
        pred_motion_mask = results['pred_motion_mask']
        text = results['text']
        token = results['token']
        self.text_encoder.to(device)
        self.motion_encoder.to(device)
        self.text_encoder.eval()
        self.motion_encoder.eval()
        with torch.no_grad():
            word_emb = self.text_encode(text, token, device=device).cpu().detach().numpy()
            motion_emb = self.motion_encode(pred_motion, pred_motion_length, pred_motion_mask, device).cpu().detach().numpy()
        dist_mat = euclidean_distance_matrix(word_emb, motion_emb)
        argsmax = np.argsort(dist_mat, axis=1)
        top_k_mat = calculate_top_k(argsmax, top_k=self.top_k)
        top_k_count = top_k_mat.sum(axis=0)
        all_size = word_emb.shape[0]
        return top_k_count, all_size

    def concat_batch_metrics(self, batch_metrics):
        top_k_count = 0
        all_size = 0
        for batch_top_k_count, batch_all_size in batch_metrics:
            top_k_count += batch_top_k_count
            all_size += batch_all_size
        R_precision = top_k_count / all_size
        return R_precision

    def parse_values(self, values):
        metrics = {}
        for top_k in range(self.top_k):
            metrics['R_precision Top %d (mean)' % (top_k + 1)] = values[0][top_k]
            metrics['R_precision Top %d (conf)' % (top_k + 1)] = values[1][top_k]
        return metrics
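R-precision asks how often the matched motion is ranked within the top k candidates for its text, which is exactly what the row-wise `argsort` plus `calculate_top_k` computes. A quick sanity check with stand-in embeddings instead of the pretrained encoders:

import numpy as np
from mogen.core.evaluation.utils import calculate_top_k, euclidean_distance_matrix

word_emb = np.random.randn(8, 16)
motion_emb = word_emb + 0.05 * np.random.randn(8, 16)   # each motion closest to its own text
dist_mat = euclidean_distance_matrix(word_emb, motion_emb)
argsmax = np.argsort(dist_mat, axis=1)                   # ranked motion indices per text
top_k_mat = calculate_top_k(argsmax, top_k=3)            # shape (8, 3), cumulative hits
r_precision = top_k_mat.sum(axis=0) / word_emb.shape[0]  # Top-1/2/3 accuracy
print(r_precision)                                       # close to [1., 1., 1.] here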
mogen/core/evaluation/get_model.py ADDED
@@ -0,0 +1,46 @@
from mogen.models import build_submodule


def get_motion_model(name, ckpt_path):
    if name == 'kit_ml':
        model = build_submodule(dict(
            type='T2MMotionEncoder',
            input_size=251,
            movement_hidden_size=512,
            movement_latent_size=512,
            motion_hidden_size=1024,
            motion_latent_size=512,
        ))
    else:
        model = build_submodule(dict(
            type='T2MMotionEncoder',
            input_size=263,
            movement_hidden_size=512,
            movement_latent_size=512,
            motion_hidden_size=1024,
            motion_latent_size=512,
        ))
    model.load_pretrained(ckpt_path)
    return model


def get_text_model(name, ckpt_path):
    if name == 'kit_ml':
        model = build_submodule(dict(
            type='T2MTextEncoder',
            word_size=300,
            pos_size=15,
            hidden_size=512,
            output_size=512,
            max_text_len=20
        ))
    else:
        model = build_submodule(dict(
            type='T2MTextEncoder',
            word_size=300,
            pos_size=15,
            hidden_size=512,
            output_size=512,
            max_text_len=20
        ))
    model.load_pretrained(ckpt_path)
    return model
mogen/core/evaluation/utils.py ADDED
@@ -0,0 +1,130 @@
import numpy as np
from scipy import linalg


def get_metric_statistics(values, replication_times):
    mean = np.mean(values, axis=0)
    std = np.std(values, axis=0)
    conf_interval = 1.96 * std / np.sqrt(replication_times)
    return mean, conf_interval


# (X - X_train)*(X - X_train) = -2X*X_train + X*X + X_train*X_train
def euclidean_distance_matrix(matrix1, matrix2):
    """
    Params:
    -- matrix1: N1 x D
    -- matrix2: N2 x D
    Returns:
    -- dist: N1 x N2
        dist[i, j] == distance(matrix1[i], matrix2[j])
    """
    assert matrix1.shape[1] == matrix2.shape[1]
    d1 = -2 * np.dot(matrix1, matrix2.T)    # shape (num_test, num_train)
    d2 = np.sum(np.square(matrix1), axis=1, keepdims=True)    # shape (num_test, 1)
    d3 = np.sum(np.square(matrix2), axis=1)    # shape (num_train, )
    dists = np.sqrt(d1 + d2 + d3)    # broadcasting
    return dists


def calculate_top_k(mat, top_k):
    size = mat.shape[0]
    gt_mat = np.expand_dims(np.arange(size), 1).repeat(size, 1)
    bool_mat = (mat == gt_mat)
    correct_vec = False
    top_k_list = []
    for i in range(top_k):
        # print(correct_vec, bool_mat[:, i])
        correct_vec = (correct_vec | bool_mat[:, i])
        # print(correct_vec)
        top_k_list.append(correct_vec[:, None])
    top_k_mat = np.concatenate(top_k_list, axis=1)
    return top_k_mat


def calculate_activation_statistics(activations):
    """
    Params:
    -- activation: num_samples x dim_feat
    Returns:
    -- mu: dim_feat
    -- sigma: dim_feat x dim_feat
    """
    mu = np.mean(activations, axis=0)
    cov = np.cov(activations, rowvar=False)
    return mu, cov


def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Numpy implementation of the Frechet Distance.

    The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
    and X_2 ~ N(mu_2, C_2) is
        d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
    Stable version by Dougal J. Sutherland.

    Params:
    -- mu1   : Numpy array containing the activations of a layer of the
               inception net (like returned by the function 'get_predictions')
               for generated samples.
    -- mu2   : The sample mean over activations, precalculated on a
               representative data set.
    -- sigma1: The covariance matrix over activations for generated samples.
    -- sigma2: The covariance matrix over activations, precalculated on a
               representative data set.
    Returns:
    --       : The Frechet Distance.
    """

    mu1 = np.atleast_1d(mu1)
    mu2 = np.atleast_1d(mu2)

    sigma1 = np.atleast_2d(sigma1)
    sigma2 = np.atleast_2d(sigma2)

    assert mu1.shape == mu2.shape, \
        'Training and test mean vectors have different lengths'
    assert sigma1.shape == sigma2.shape, \
        'Training and test covariances have different dimensions'

    diff = mu1 - mu2

    # Product might be almost singular
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        msg = ('fid calculation produces singular product; '
               'adding %s to diagonal of cov estimates') % eps
        print(msg)
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # Numerical error might give slight imaginary component
    if np.iscomplexobj(covmean):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
            m = np.max(np.abs(covmean.imag))
            raise ValueError('Imaginary component {}'.format(m))
        covmean = covmean.real

    tr_covmean = np.trace(covmean)

    return (diff.dot(diff) + np.trace(sigma1) +
            np.trace(sigma2) - 2 * tr_covmean)


def calculate_diversity(activation, diversity_times):
    assert len(activation.shape) == 2
    assert activation.shape[0] > diversity_times
    num_samples = activation.shape[0]

    first_indices = np.random.choice(num_samples, diversity_times, replace=False)
    second_indices = np.random.choice(num_samples, diversity_times, replace=False)
    dist = linalg.norm(activation[first_indices] - activation[second_indices], axis=1)
    return dist.mean()


def calculate_multimodality(activation, multimodality_times):
    assert len(activation.shape) == 3
    assert activation.shape[1] > multimodality_times
    num_per_sent = activation.shape[1]

    first_dices = np.random.choice(num_per_sent, multimodality_times, replace=False)
    second_dices = np.random.choice(num_per_sent, multimodality_times, replace=False)
    dist = linalg.norm(activation[:, first_dices] - activation[:, second_dices], axis=2)
    return dist.mean()
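`calculate_frechet_distance` implements d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)), so two sample sets drawn from the same Gaussian should score near zero, while a mean-shifted set scores roughly the squared shift times the dimensionality. A quick self-check of these utilities on synthetic data:

import numpy as np
from mogen.core.evaluation.utils import (
    calculate_activation_statistics, calculate_frechet_distance)

rng = np.random.default_rng(0)
real = rng.normal(0.0, 1.0, size=(2000, 16))
fake_good = rng.normal(0.0, 1.0, size=(2000, 16))   # same distribution as 'real'
fake_bad = rng.normal(2.0, 1.0, size=(2000, 16))    # mean shifted by 2 in every dim

mu_r, cov_r = calculate_activation_statistics(real)
for name, fake in [('good', fake_good), ('bad', fake_bad)]:
    mu_f, cov_f = calculate_activation_statistics(fake)
    print(name, calculate_frechet_distance(mu_r, cov_r, mu_f, cov_f))
# 'good' is close to 0; 'bad' is roughly 16 * 2**2 = 64 plus a small covariance term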
mogen/core/optimizer/__init__.py ADDED
@@ -0,0 +1,3 @@
from .builder import OPTIMIZERS, build_optimizers

__all__ = ['build_optimizers', 'OPTIMIZERS']
mogen/core/optimizer/builder.py ADDED
@@ -0,0 +1,52 @@
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.runner import build_optimizer
from mmcv.utils import Registry

OPTIMIZERS = Registry('optimizers')


def build_optimizers(model, cfgs):
    """Build multiple optimizers from configs.

    If `cfgs` contains several dicts for optimizers, a dict mapping each name
    to its constructed optimizer will be returned. If `cfgs` only contains one
    optimizer config, the constructed optimizer itself will be returned.
    For example,

    1) Multiple optimizer configs:

    .. code-block:: python

        optimizer_cfg = dict(
            model1=dict(type='SGD', lr=lr),
            model2=dict(type='SGD', lr=lr))

    The return dict is
    ``dict('model1': torch.optim.Optimizer, 'model2': torch.optim.Optimizer)``

    2) Single optimizer config:

    .. code-block:: python

        optimizer_cfg = dict(type='SGD', lr=lr)

    The return is ``torch.optim.Optimizer``.

    Args:
        model (:obj:`nn.Module`): The model with parameters to be optimized.
        cfgs (dict): The config dict of the optimizer.

    Returns:
        dict[:obj:`torch.optim.Optimizer`] | :obj:`torch.optim.Optimizer`:
            The initialized optimizers.
    """
    optimizers = {}
    if hasattr(model, 'module'):
        model = model.module
    # determine whether 'cfgs' has several dicts for optimizers
    if all(isinstance(v, dict) for v in cfgs.values()):
        for key, cfg in cfgs.items():
            cfg_ = cfg.copy()
            module = getattr(model, key)
            optimizers[key] = build_optimizer(module, cfg_)
        return optimizers

    return build_optimizer(model, cfgs)
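Continuing the docstring's example: when the config maps submodule names to optimizer configs, each named attribute of the model gets its own optimizer, while a flat config produces a single optimizer for the whole model. A minimal sketch (the two-part model here is illustrative):

import torch.nn as nn
from mogen.core.optimizer import build_optimizers


class TwoPartModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model1 = nn.Linear(8, 8)
        self.model2 = nn.Linear(8, 8)


model = TwoPartModel()
# one optimizer per named submodule
optims = build_optimizers(model, dict(
    model1=dict(type='Adam', lr=1e-4),
    model2=dict(type='SGD', lr=1e-2)))
# a single optimizer over all parameters
optim = build_optimizers(model, dict(type='Adam', lr=2e-4))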
mogen/datasets/__init__.py ADDED
@@ -0,0 +1,11 @@
from .base_dataset import BaseMotionDataset
from .text_motion_dataset import TextMotionDataset
from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset
from .pipelines import Compose
from .samplers import DistributedSampler


__all__ = [
    'BaseMotionDataset', 'TextMotionDataset', 'DATASETS', 'PIPELINES', 'build_dataloader',
    'build_dataset', 'Compose', 'DistributedSampler'
]
mogen/datasets/base_dataset.py ADDED
@@ -0,0 +1,117 @@
import os
import copy
from typing import Optional, Union

import numpy as np
from torch.utils.data import Dataset

from .pipelines import Compose
from .builder import DATASETS
from mogen.core.evaluation import build_evaluator


@DATASETS.register_module()
class BaseMotionDataset(Dataset):
    """Base motion dataset.

    Args:
        data_prefix (str): the prefix of data path.
        pipeline (list): a list of dict, where each element represents
            an operation defined in `mogen.datasets.pipelines`.
        ann_file (str | None, optional): the annotation file. When ann_file is
            str, the subclass is expected to read from the ann_file. When
            ann_file is None, the subclass is expected to read according
            to data_prefix.
        test_mode (bool): in train mode or test mode. Default: False.
        dataset_name (str | None, optional): the name of dataset. It is used
            to identify the type of evaluation metric. Default: None.
    """

    def __init__(self,
                 data_prefix: str,
                 pipeline: list,
                 dataset_name: Optional[Union[str, None]] = None,
                 fixed_length: Optional[Union[int, None]] = None,
                 ann_file: Optional[Union[str, None]] = None,
                 motion_dir: Optional[Union[str, None]] = None,
                 eval_cfg: Optional[Union[dict, None]] = None,
                 test_mode: Optional[bool] = False):
        super(BaseMotionDataset, self).__init__()

        self.data_prefix = data_prefix
        self.pipeline = Compose(pipeline)
        self.dataset_name = dataset_name
        self.fixed_length = fixed_length
        self.ann_file = os.path.join(data_prefix, 'datasets', dataset_name, ann_file)
        self.motion_dir = os.path.join(data_prefix, 'datasets', dataset_name, motion_dir)
        self.eval_cfg = copy.deepcopy(eval_cfg)
        self.test_mode = test_mode

        self.load_annotations()
        if self.test_mode:
            self.prepare_evaluation()

    def load_anno(self, name):
        motion_path = os.path.join(self.motion_dir, name + '.npy')
        motion_data = np.load(motion_path)
        return {'motion': motion_data}

    def load_annotations(self):
        """Load annotations from ``ann_file`` to ``data_infos``."""
        self.data_infos = []
        for line in open(self.ann_file, 'r').readlines():
            line = line.strip()
            self.data_infos.append(self.load_anno(line))

    def prepare_data(self, idx: int):
        """Prepare raw data for the ``idx``-th data."""
        results = copy.deepcopy(self.data_infos[idx])
        results['dataset_name'] = self.dataset_name
        results['sample_idx'] = idx
        return self.pipeline(results)

    def __len__(self):
        """Return the length of current dataset."""
        if self.test_mode:
            return len(self.eval_indexes)
        elif self.fixed_length is not None:
            return self.fixed_length
        return len(self.data_infos)

    def __getitem__(self, idx: int):
        """Prepare data for the ``idx``-th data.

        As for video dataset, we can first parse raw data for each frame. Then
        we combine annotations from all frames. This interface is used to
        simplify the logic of video dataset and other special datasets.
        """
        if self.test_mode:
            idx = self.eval_indexes[idx]
        elif self.fixed_length is not None:
            idx = idx % len(self.data_infos)
        return self.prepare_data(idx)

    def prepare_evaluation(self):
        self.evaluators = []
        self.eval_indexes = []
        for _ in range(self.eval_cfg['replication_times']):
            eval_indexes = np.arange(len(self.data_infos))
            if self.eval_cfg.get('shuffle_indexes', False):
                np.random.shuffle(eval_indexes)
            self.eval_indexes.append(eval_indexes)
        for metric in self.eval_cfg['metrics']:
            evaluator, self.eval_indexes = build_evaluator(
                metric, self.eval_cfg, len(self.data_infos), self.eval_indexes)
            self.evaluators.append(evaluator)

        self.eval_indexes = np.concatenate(self.eval_indexes)

    def evaluate(self, results, work_dir, logger=None):
        metrics = {}
        device = results[0]['motion'].device
        for evaluator in self.evaluators:
            evaluator.to_device(device)
            metrics.update(evaluator.evaluate(results))
        if logger is not None:
            logger.info(metrics)
        return metrics
mogen/datasets/builder.py ADDED
@@ -0,0 +1,113 @@
import platform
import random
from functools import partial
from typing import Optional, Union

import numpy as np
from mmcv.parallel import collate
from mmcv.runner import get_dist_info
from mmcv.utils import Registry, build_from_cfg
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

from .samplers import DistributedSampler

if platform.system() != 'Windows':
    # https://github.com/pytorch/pytorch/issues/973
    import resource
    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
    base_soft_limit = rlimit[0]
    hard_limit = rlimit[1]
    soft_limit = min(max(4096, base_soft_limit), hard_limit)
    resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))

DATASETS = Registry('dataset')
PIPELINES = Registry('pipeline')


def build_dataset(cfg: Union[dict, list, tuple],
                  default_args: Optional[Union[dict, None]] = None):
    """Build dataset by the given config."""
    from .dataset_wrappers import (
        ConcatDataset,
        RepeatDataset,
    )
    if isinstance(cfg, (list, tuple)):
        dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg])
    elif cfg['type'] == 'RepeatDataset':
        dataset = RepeatDataset(
            build_dataset(cfg['dataset'], default_args), cfg['times'])
    else:
        dataset = build_from_cfg(cfg, DATASETS, default_args)

    return dataset


def build_dataloader(dataset: Dataset,
                     samples_per_gpu: int,
                     workers_per_gpu: int,
                     num_gpus: Optional[int] = 1,
                     dist: Optional[bool] = True,
                     shuffle: Optional[bool] = True,
                     round_up: Optional[bool] = True,
                     seed: Optional[Union[int, None]] = None,
                     persistent_workers: Optional[bool] = True,
                     **kwargs):
    """Build PyTorch DataLoader.

    In distributed training, each GPU/process has a dataloader.
    In non-distributed training, there is only one dataloader for all GPUs.

    Args:
        dataset (:obj:`Dataset`): A PyTorch dataset.
        samples_per_gpu (int): Number of training samples on each GPU, i.e.,
            batch size of each GPU.
        workers_per_gpu (int): How many subprocesses to use for data loading
            for each GPU.
        num_gpus (int, optional): Number of GPUs. Only used in non-distributed
            training.
        dist (bool, optional): Distributed training/test or not. Default: True.
        shuffle (bool, optional): Whether to shuffle the data at every epoch.
            Default: True.
        round_up (bool, optional): Whether to round up the length of dataset by
            adding extra samples to make it evenly divisible. Default: True.
        kwargs: any keyword argument to be used to initialize DataLoader.

    Returns:
        DataLoader: A PyTorch dataloader.
    """
    rank, world_size = get_dist_info()
    if dist:
        sampler = DistributedSampler(
            dataset, world_size, rank, shuffle=shuffle, round_up=round_up)
        shuffle = False
        batch_size = samples_per_gpu
        num_workers = workers_per_gpu
    else:
        sampler = None
        batch_size = num_gpus * samples_per_gpu
        num_workers = num_gpus * workers_per_gpu

    init_fn = partial(
        worker_init_fn, num_workers=num_workers, rank=rank,
        seed=seed) if seed is not None else None

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
        collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
        pin_memory=False,
        shuffle=shuffle,
        worker_init_fn=init_fn,
        persistent_workers=persistent_workers,
        **kwargs)

    return data_loader


def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int):
    """Init random seed for each worker."""
    # The seed of each worker equals to
    # num_worker * rank + worker_id + user_seed
    worker_seed = num_workers * rank + worker_id + seed
    np.random.seed(worker_seed)
    random.seed(worker_seed)
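A minimal sketch of the non-distributed path: with `dist=False` there is no `DistributedSampler`, shuffling stays in the `DataLoader`, and the effective batch size is `num_gpus * samples_per_gpu`. The `TensorDataset` here is only a stand-in for a dataset produced by `build_dataset`:

import torch
from torch.utils.data import TensorDataset
from mogen.datasets import build_dataloader

dataset = TensorDataset(torch.randn(100, 8))  # stand-in for a built motion dataset
loader = build_dataloader(
    dataset,
    samples_per_gpu=16,
    workers_per_gpu=0,
    dist=False,
    shuffle=True,
    seed=42,
    persistent_workers=False)  # must be False when workers_per_gpu == 0
batch = next(iter(loader))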
mogen/datasets/dataset_wrappers.py ADDED
@@ -0,0 +1,42 @@
from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
from torch.utils.data.dataset import Dataset

from .builder import DATASETS


@DATASETS.register_module()
class ConcatDataset(_ConcatDataset):
    """A wrapper of concatenated dataset.

    Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but registered so
    that it can be built from a config.

    Args:
        datasets (list[:obj:`Dataset`]): A list of datasets.
    """

    def __init__(self, datasets: list):
        super(ConcatDataset, self).__init__(datasets)


@DATASETS.register_module()
class RepeatDataset(object):
    """A wrapper of repeated dataset.

    The length of repeated dataset will be `times` larger than the original
    dataset. This is useful when the data loading time is long but the dataset
    is small. Using RepeatDataset can reduce the data loading time between
    epochs.

    Args:
        dataset (:obj:`Dataset`): The dataset to be repeated.
        times (int): Repeat times.
    """

    def __init__(self, dataset: Dataset, times: int):
        self.dataset = dataset
        self.times = times

        self._ori_len = len(self.dataset)

    def __getitem__(self, idx: int):
        return self.dataset[idx % self._ori_len]

    def __len__(self):
        return self.times * self._ori_len
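RepeatDataset only changes the indexing arithmetic, so one epoch over it revisits the wrapped data `times` times without copying anything. A quick illustration (any map-style dataset works; a plain list is enough here):

from mogen.datasets.dataset_wrappers import RepeatDataset

base = list(range(5))                # stand-in for a real dataset
repeated = RepeatDataset(base, times=3)
print(len(repeated))                 # 15
print(repeated[7])                   # 2  (7 % 5)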
mogen/datasets/pipelines/__init__.py ADDED
@@ -0,0 +1,18 @@
from .compose import Compose
from .formatting import (
    to_tensor,
    ToTensor,
    Transpose,
    Collect,
    WrapFieldsToLists
)
from .transforms import (
    Crop,
    RandomCrop,
    Normalize
)

__all__ = [
    'Compose', 'to_tensor', 'Transpose', 'Collect', 'WrapFieldsToLists', 'ToTensor',
    'Crop', 'RandomCrop', 'Normalize'
]
mogen/datasets/pipelines/compose.py ADDED
@@ -0,0 +1,42 @@
from collections.abc import Sequence

from mmcv.utils import build_from_cfg

from ..builder import PIPELINES


@PIPELINES.register_module()
class Compose(object):
    """Compose a data pipeline with a sequence of transforms.

    Args:
        transforms (list[dict | callable]):
            Either config dicts of transforms or transform objects.
    """

    def __init__(self, transforms):
        assert isinstance(transforms, Sequence)
        self.transforms = []
        for transform in transforms:
            if isinstance(transform, dict):
                transform = build_from_cfg(transform, PIPELINES)
                self.transforms.append(transform)
            elif callable(transform):
                self.transforms.append(transform)
            else:
                raise TypeError('transform must be callable or a dict, but got'
                                f' {type(transform)}')

    def __call__(self, data):
        for t in self.transforms:
            data = t(data)
            if data is None:
                return None
        return data

    def __repr__(self):
        format_string = self.__class__.__name__ + '('
        for t in self.transforms:
            format_string += f'\n    {t}'
        format_string += '\n)'
        return format_string
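Compose accepts a mix of registered config dicts and plain callables, so a pipeline can be sketched directly from the transforms defined in this commit (Normalize is left out here because it needs mean/std files on disk; 263 is the HumanML3D feature size used elsewhere in this repo):

import numpy as np
from mogen.datasets.pipelines import Compose

pipeline = Compose([
    dict(type='Crop', crop_size=196),        # registered transform, built from cfg
    dict(type='ToTensor', keys=['motion']),  # registered transform
    lambda results: results,                 # plain callables are accepted as-is
])
results = pipeline({'motion': np.random.randn(120, 263).astype(np.float32)})
print(results['motion'].shape, results['motion_length'], results['motion_mask'].shape)
# torch.Size([196, 263]) 120 (196,)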
mogen/datasets/pipelines/formatting.py ADDED
@@ -0,0 +1,134 @@
from collections.abc import Sequence

import mmcv
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from PIL import Image

from ..builder import PIPELINES


def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int` and :class:`float`.
    """
    if isinstance(data, torch.Tensor):
        return data
    elif isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    elif isinstance(data, Sequence) and not mmcv.is_str(data):
        return torch.tensor(data)
    elif isinstance(data, int):
        return torch.LongTensor([data])
    elif isinstance(data, float):
        return torch.FloatTensor([data])
    else:
        raise TypeError(
            f'Type {type(data)} cannot be converted to tensor.'
            'Supported types are: `numpy.ndarray`, `torch.Tensor`, '
            '`Sequence`, `int` and `float`')


@PIPELINES.register_module()
class ToTensor(object):

    def __init__(self, keys):
        self.keys = keys

    def __call__(self, results):
        for key in self.keys:
            results[key] = to_tensor(results[key])
        return results

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.keys})'


@PIPELINES.register_module()
class Transpose(object):

    def __init__(self, keys, order):
        self.keys = keys
        self.order = order

    def __call__(self, results):
        for key in self.keys:
            results[key] = results[key].transpose(self.order)
        return results

    def __repr__(self):
        return self.__class__.__name__ + \
            f'(keys={self.keys}, order={self.order})'


@PIPELINES.register_module()
class Collect(object):
    """Collect data from the loader relevant to the specific task.

    This is usually the last stage of the data loader pipeline.

    Args:
        keys (Sequence[str]): Keys of results to be collected in ``data``.
        meta_keys (Sequence[str], optional): Meta keys to be converted to
            ``mmcv.DataContainer`` and collected in ``data[motion_metas]``.
            Default: ``('filename', 'ori_filename', 'ori_shape', 'motion_shape', 'motion_mask')``

    Returns:
        dict: The result dict contains the following keys
            - keys in ``self.keys``
            - ``motion_metas`` if available
    """

    def __init__(self,
                 keys,
                 meta_keys=('filename', 'ori_filename', 'ori_shape', 'motion_shape', 'motion_mask')):
        self.keys = keys
        self.meta_keys = meta_keys

    def __call__(self, results):
        data = {}
        motion_meta = {}
        for key in self.meta_keys:
            if key in results:
                motion_meta[key] = results[key]
        data['motion_metas'] = DC(motion_meta, cpu_only=True)
        for key in self.keys:
            data[key] = results[key]
        return data

    def __repr__(self):
        return self.__class__.__name__ + \
            f'(keys={self.keys}, meta_keys={self.meta_keys})'


@PIPELINES.register_module()
class WrapFieldsToLists(object):
    """Wrap fields of the data dictionary into lists for evaluation.

    This class can be used as a last step of a test or validation
    pipeline for single image evaluation or inference.

    Example:
        >>> test_pipeline = [
        >>>     dict(type='LoadImageFromFile'),
        >>>     dict(type='Normalize',
        >>>          mean=[123.675, 116.28, 103.53],
        >>>          std=[58.395, 57.12, 57.375],
        >>>          to_rgb=True),
        >>>     dict(type='ImageToTensor', keys=['img']),
        >>>     dict(type='Collect', keys=['img']),
        >>>     dict(type='WrapIntoLists')
        >>> ]
    """

    def __call__(self, results):
        # Wrap dict fields into lists
        for key, val in results.items():
            results[key] = [val]
        return results

    def __repr__(self):
        return f'{self.__class__.__name__}()'
mogen/datasets/pipelines/transforms.py ADDED
@@ -0,0 +1,120 @@
import math
import random

import mmcv
import numpy as np

from ..builder import PIPELINES
import torch
from typing import Optional, Tuple, Union


@PIPELINES.register_module()
class Crop(object):
    r"""Crop motion sequences.

    Args:
        crop_size (int): The size of the cropped motion sequence.
    """

    def __init__(self,
                 crop_size: Optional[Union[int, None]] = None):
        self.crop_size = crop_size
        assert self.crop_size is not None

    def __call__(self, results):
        motion = results['motion']
        length = len(motion)
        if length >= self.crop_size:
            idx = random.randint(0, length - self.crop_size)
            motion = motion[idx: idx + self.crop_size]
            results['motion_length'] = self.crop_size
        else:
            padding_length = self.crop_size - length
            D = motion.shape[1:]
            padding_zeros = np.zeros((padding_length, *D), dtype=np.float32)
            motion = np.concatenate([motion, padding_zeros], axis=0)
            results['motion_length'] = length
        assert len(motion) == self.crop_size
        results['motion'] = motion
        results['motion_shape'] = motion.shape
        if length >= self.crop_size:
            results['motion_mask'] = torch.ones(self.crop_size).numpy()
        else:
            results['motion_mask'] = torch.cat(
                (torch.ones(length), torch.zeros(self.crop_size - length))).numpy()
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__ + f'(crop_size={self.crop_size})'
        return repr_str


@PIPELINES.register_module()
class RandomCrop(object):
    r"""Random crop motion sequences. Each sequence will be padded with zeros to the maximum length.

    Args:
        min_size (int or None): The minimum size of the cropped motion sequence (inclusive).
        max_size (int or None): The maximum size of the cropped motion sequence (inclusive).
    """

    def __init__(self,
                 min_size: Optional[Union[int, None]] = None,
                 max_size: Optional[Union[int, None]] = None):
        self.min_size = min_size
        self.max_size = max_size
        assert self.min_size is not None
        assert self.max_size is not None

    def __call__(self, results):
        motion = results['motion']
        length = len(motion)
        crop_size = random.randint(self.min_size, self.max_size)
        if length > crop_size:
            idx = random.randint(0, length - crop_size)
            motion = motion[idx: idx + crop_size]
            results['motion_length'] = crop_size
        else:
            results['motion_length'] = length
        padding_length = self.max_size - min(crop_size, length)
        if padding_length > 0:
            D = motion.shape[1:]
            padding_zeros = np.zeros((padding_length, *D), dtype=np.float32)
            motion = np.concatenate([motion, padding_zeros], axis=0)
        results['motion'] = motion
        results['motion_shape'] = motion.shape
        if length >= self.max_size and crop_size == self.max_size:
            results['motion_mask'] = torch.ones(self.max_size).numpy()
        else:
            results['motion_mask'] = torch.cat((
                torch.ones(min(length, crop_size)),
                torch.zeros(self.max_size - min(length, crop_size))), dim=0).numpy()
        assert len(motion) == self.max_size
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__ + f'(min_size={self.min_size}'
        repr_str += f', max_size={self.max_size})'
        return repr_str


@PIPELINES.register_module()
class Normalize(object):
    """Normalize motion sequences.

    Args:
        mean_path (str): Path of mean file.
        std_path (str): Path of std file.
    """

    def __init__(self, mean_path, std_path, eps=1e-9):
        self.mean = np.load(mean_path)
        self.std = np.load(std_path)
        self.eps = eps

    def __call__(self, results):
        motion = results['motion']
        motion = (motion - self.mean) / (self.std + self.eps)
        results['motion'] = motion
        results['motion_norm_mean'] = self.mean
        results['motion_norm_std'] = self.std
        return results
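RandomCrop always pads the output to `max_size`, so every sample in a batch shares the same temporal length while `motion_mask` records which frames are real. A small sketch using the transform directly (263 is again the HumanML3D feature size assumed for illustration):

import numpy as np
from mogen.datasets.pipelines import RandomCrop

transform = RandomCrop(min_size=20, max_size=196)
out = transform({'motion': np.random.randn(60, 263).astype(np.float32)})
print(out['motion'].shape)             # (196, 263): always padded to max_size
print(out['motion_length'])            # <= 60: number of frames actually kept
print(int(out['motion_mask'].sum()))   # equals out['motion_length']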
mogen/datasets/samplers/__init__.py ADDED
@@ -0,0 +1,3 @@
from .distributed_sampler import DistributedSampler

__all__ = ['DistributedSampler']
mogen/datasets/samplers/distributed_sampler.py ADDED
@@ -0,0 +1,42 @@
import torch
from torch.utils.data import DistributedSampler as _DistributedSampler


class DistributedSampler(_DistributedSampler):

    def __init__(self,
                 dataset,
                 num_replicas=None,
                 rank=None,
                 shuffle=True,
                 round_up=True):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank)
        self.shuffle = shuffle
        self.round_up = round_up
        if self.round_up:
            self.total_size = self.num_samples * self.num_replicas
        else:
            self.total_size = len(self.dataset)

    def __iter__(self):
        # deterministically shuffle based on epoch
        if self.shuffle:
            g = torch.Generator()
            g.manual_seed(self.epoch)
            indices = torch.randperm(len(self.dataset), generator=g).tolist()
        else:
            indices = torch.arange(len(self.dataset)).tolist()

        # add extra samples to make it evenly divisible
        if self.round_up:
            indices = (
                indices *
                int(self.total_size / len(indices) + 1))[:self.total_size]
            assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank:self.total_size:self.num_replicas]
        if self.round_up:
            assert len(indices) == self.num_samples

        return iter(indices)
mogen/datasets/text_motion_dataset.py ADDED
@@ -0,0 +1,93 @@
import json
import os
import os.path
from abc import ABCMeta
from collections import OrderedDict
from typing import Any, List, Optional, Union

import mmcv
import copy
import numpy as np
import torch
import torch.distributed as dist
from mmcv.runner import get_dist_info

from .base_dataset import BaseMotionDataset
from .builder import DATASETS


@DATASETS.register_module()
class TextMotionDataset(BaseMotionDataset):
    """TextMotion dataset.

    Args:
        text_dir (str): Path to the directory containing the text files.
    """

    def __init__(self,
                 data_prefix: str,
                 pipeline: list,
                 dataset_name: Optional[Union[str, None]] = None,
                 fixed_length: Optional[Union[int, None]] = None,
                 ann_file: Optional[Union[str, None]] = None,
                 motion_dir: Optional[Union[str, None]] = None,
                 text_dir: Optional[Union[str, None]] = None,
                 token_dir: Optional[Union[str, None]] = None,
                 clip_feat_dir: Optional[Union[str, None]] = None,
                 eval_cfg: Optional[Union[dict, None]] = None,
                 fine_mode: Optional[bool] = False,
                 test_mode: Optional[bool] = False):
        self.text_dir = os.path.join(data_prefix, 'datasets', dataset_name, text_dir)
        if token_dir is not None:
            self.token_dir = os.path.join(data_prefix, 'datasets', dataset_name, token_dir)
        else:
            self.token_dir = None
        if clip_feat_dir is not None:
            self.clip_feat_dir = os.path.join(data_prefix, 'datasets', dataset_name, clip_feat_dir)
        else:
            self.clip_feat_dir = None
        self.fine_mode = fine_mode
        super(TextMotionDataset, self).__init__(
            data_prefix=data_prefix,
            pipeline=pipeline,
            dataset_name=dataset_name,
            fixed_length=fixed_length,
            ann_file=ann_file,
            motion_dir=motion_dir,
            eval_cfg=eval_cfg,
            test_mode=test_mode)

    def load_anno(self, name):
        results = super().load_anno(name)
        text_path = os.path.join(self.text_dir, name + '.txt')
        text_data = []
        for line in open(text_path, 'r'):
            text_data.append(line.strip())
        results['text'] = text_data
        if self.token_dir is not None:
            token_path = os.path.join(self.token_dir, name + '.txt')
            token_data = []
            for line in open(token_path, 'r'):
                token_data.append(line.strip())
            results['token'] = token_data
        if self.clip_feat_dir is not None:
            clip_feat_path = os.path.join(self.clip_feat_dir, name + '.npy')
            clip_feat = torch.from_numpy(np.load(clip_feat_path))
            results['clip_feat'] = clip_feat
        return results

    def prepare_data(self, idx: int):
        """Prepare raw data for the ``idx``-th data."""
        results = copy.deepcopy(self.data_infos[idx])
        text_list = results['text']
        idx = np.random.randint(0, len(text_list))
        if self.fine_mode:
            results['text'] = json.loads(text_list[idx])
        else:
            results['text'] = text_list[idx]
        if 'clip_feat' in results.keys():
            results['clip_feat'] = results['clip_feat'][idx]
        if 'token' in results.keys():
            results['token'] = results['token'][idx]
        results['dataset_name'] = self.dataset_name
        results['sample_idx'] = idx
        return self.pipeline(results)
mogen/models/__init__.py ADDED
@@ -0,0 +1,7 @@
from .architectures import *
from .losses import *
from .rnns import *
from .transformers import *
from .attentions import *
from .builder import *
from .utils import *