Spaces: bill-jiang/MotionGPT (status: Runtime error)
Commit 4409449 • committed by bill-jiang
Parent(s): a0563b6
Init
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set. A sketch for fetching the complete file tree follows the changed-files list below.
- .gitignore +165 -0
- README.md +4 -4
- app.py +511 -0
- assets/css/custom.css +359 -0
- assets/images/avatar_bot.jpg +0 -0
- assets/meta/mean.npy +3 -0
- assets/meta/mean_eval.npy +3 -0
- assets/meta/std.npy +3 -0
- assets/meta/std_eval.npy +3 -0
- assets/videos/m2t_0.mp4 +0 -0
- assets/videos/t2m_0.mp4 +0 -0
- configs/assets.yaml +32 -0
- configs/default.yaml +141 -0
- configs/evaluator/tm2t.yaml +19 -0
- configs/lm/default.yaml +7 -0
- configs/render.yaml +23 -0
- configs/vq/default.yaml +15 -0
- configs/webui.yaml +74 -0
- deps/smpl/smpl_models/SMPL_downsample_index.pkl +3 -0
- deps/smpl/smpl_models/gmm_08.pkl +3 -0
- deps/smpl/smpl_models/neutral_smpl_mean_params.h5 +3 -0
- deps/smpl/smpl_models/smpl.faces +0 -0
- deps/smpl/smpl_models/smpl.tar.gz +3 -0
- deps/smpl/smpl_models/smpl/SMPL_FEMALE.pkl +3 -0
- deps/smpl/smpl_models/smpl/SMPL_MALE.pkl +3 -0
- deps/smpl/smpl_models/smpl/SMPL_NEUTRAL.pkl +3 -0
- deps/smpl/smpl_models/smpl/readme.txt +1 -0
- deps/smpl/smpl_models/smplh/SMPLH_FEMALE.npz +3 -0
- deps/smpl/smpl_models/smplh/SMPLH_MALE.npz +3 -0
- deps/smpl/smpl_models/smplh/SMPLH_NEUTRAL.npz +3 -0
- deps/smpl/smpl_models/smplh/mano_v1_2.zip +3 -0
- deps/smpl/smpl_models/smplh/smplh.faces +0 -0
- deps/smpl/smpl_models/smplh/smplh.tar.xz +3 -0
- deps/smpl/smpl_models/smplx_parts_segm.pkl +3 -0
- mGPT/__init__.py +0 -0
- mGPT/archs/__init__.py +0 -0
- mGPT/archs/mgpt_lm.py +592 -0
- mGPT/archs/mgpt_vq.py +190 -0
- mGPT/archs/tm2t_evaluator.py +111 -0
- mGPT/archs/tools/embeddings.py +322 -0
- mGPT/archs/tools/quantize_cnn.py +414 -0
- mGPT/archs/tools/resnet.py +82 -0
- mGPT/archs/tools/token_emb.py +73 -0
- mGPT/archs/tools/transformer_layers.py +285 -0
- mGPT/callback.py +200 -0
- mGPT/config.py +217 -0
- mGPT/data/HumanML3D.py +117 -0
- mGPT/data/Kit.py +88 -0
- mGPT/data/__init__.py +103 -0
- mGPT/data/build_data.py +15 -0
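Since the diff view above is truncated to 50 files, one way to inspect the complete tree is to pull the whole Space locally. A minimal sketch, assuming the Space id bill-jiang/MotionGPT shown in the header and a recent huggingface_hub; this is not part of the commit itself:

# Sketch: fetch the full Space file tree (assumes the repo id and repo_type below are correct).
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="bill-jiang/MotionGPT",  # the Space shown in this commit view
    repo_type="space",               # Spaces are stored under repo_type="space"
)
print(local_dir)  # path to the snapshot, including files beyond the 50 shown here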
.gitignore
ADDED
@@ -0,0 +1,165 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so
.DS_Store
pyglet
app2.py
render.py
cache

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
README.md
CHANGED
@@ -1,13 +1,13 @@
 ---
 title: MotionGPT
-emoji:
-colorFrom:
-colorTo:
+emoji: 🏃
+colorFrom: yellow
+colorTo: blue
 sdk: gradio
 sdk_version: 3.43.2
 app_file: app.py
 pinned: false
-license:
+license: mit
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,511 @@
import gradio as gr
import random
import torch
import time
import cv2
import os
import numpy as np
import OpenGL.GL as gl
import pytorch_lightning as pl
import moviepy.editor as mp
from pathlib import Path
from mGPT.data.build_data import build_data
from mGPT.models.build_model import build_model
from mGPT.config import parse_args
from scipy.spatial.transform import Rotation as RRR
import mGPT.render.matplot.plot_3d_global as plot_3d
from mGPT.render.pyrender.hybrik_loc2rot import HybrIKJointsToRotmat
from mGPT.render.pyrender.smpl_render import SMPLRender
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
from huggingface_hub import snapshot_download

os.environ["PYOPENGL_PLATFORM"] = "egl"
os.environ["MESA_GL_VERSION_OVERRIDE"] = "4.1"
os.system('pip install /home/user/app/pyrender')

# Load model
cfg = parse_args(phase="webui")  # parse config file
cfg.FOLDER = 'cache'
output_dir = Path(cfg.FOLDER)
output_dir.mkdir(parents=True, exist_ok=True)
pl.seed_everything(cfg.SEED_VALUE)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model_path = snapshot_download(repo_id="bill-jiang/MotionGPT-base")

datamodule = build_data(cfg, phase="test")
model = build_model(cfg, datamodule)
state_dict = torch.load(f'{model_path}/motiongpt_s3_h3d.tar',
                        map_location="cpu")["state_dict"]
model.load_state_dict(state_dict)
model.to(device)

audio_processor = WhisperProcessor.from_pretrained(cfg.model.whisper_path)
audio_model = WhisperForConditionalGeneration.from_pretrained(
    cfg.model.whisper_path).to(device)
forced_decoder_ids_zh = audio_processor.get_decoder_prompt_ids(
    language="zh", task="translate")
forced_decoder_ids_en = audio_processor.get_decoder_prompt_ids(
    language="en", task="translate")

# HTML Style

Video_Components = """
<div class="side-video" style="position: relative;">
    <video width="340" autoplay loop>
        <source src="file/{video_path}" type="video/mp4">
    </video>
    <a class="videodl-button" href="file/{video_path}" download="{video_fname}" title="Download Video">
        <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-video"><path d="m22 8-6 4 6 4V8Z"/><rect width="14" height="12" x="2" y="6" rx="2" ry="2"/></svg>
    </a>
    <a class="npydl-button" href="file/{motion_path}" download="{motion_fname}" title="Download Motion">
        <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-file-box"><path d="M14.5 22H18a2 2 0 0 0 2-2V7.5L14.5 2H6a2 2 0 0 0-2 2v4"/><polyline points="14 2 14 8 20 8"/><path d="M2.97 13.12c-.6.36-.97 1.02-.97 1.74v3.28c0 .72.37 1.38.97 1.74l3 1.83c.63.39 1.43.39 2.06 0l3-1.83c.6-.36.97-1.02.97-1.74v-3.28c0-.72-.37-1.38-.97-1.74l-3-1.83a1.97 1.97 0 0 0-2.06 0l-3 1.83Z"/><path d="m7 17-4.74-2.85"/><path d="m7 17 4.74-2.85"/><path d="M7 17v5"/></svg>
    </a>
</div>
"""

Video_Components_example = """
<div class="side-video" style="position: relative;">
    <video width="340" autoplay loop controls>
        <source src="file/{video_path}" type="video/mp4">
    </video>
    <a class="npydl-button" href="file/{video_path}" download="{video_fname}" title="Download Video">
        <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-video"><path d="m22 8-6 4 6 4V8Z"/><rect width="14" height="12" x="2" y="6" rx="2" ry="2"/></svg>
    </a>
</div>
"""

Text_Components = """
<h3 class="side-content" >{msg}</h3>
"""


def motion_token_to_string(motion_token, lengths, codebook_size=512):
    motion_string = []
    for i in range(motion_token.shape[0]):
        motion_i = motion_token[i].cpu(
        ) if motion_token.device.type == 'cuda' else motion_token[i]
        motion_list = motion_i.tolist()[:lengths[i]]
        motion_string.append(
            (f'<motion_id_{codebook_size}>' +
             ''.join([f'<motion_id_{int(i)}>' for i in motion_list]) +
             f'<motion_id_{codebook_size + 1}>'))
    return motion_string


def render_motion(data, feats, method='fast'):
    fname = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(
        time.time())) + str(np.random.randint(10000, 99999))
    video_fname = fname + '.mp4'
    feats_fname = fname + '.npy'
    output_npy_path = os.path.join(output_dir, feats_fname)
    output_mp4_path = os.path.join(output_dir, video_fname)
    np.save(output_npy_path, feats)

    if method == 'slow':
        if len(data.shape) == 4:
            data = data[0]
        data = data - data[0, 0]
        pose_generator = HybrIKJointsToRotmat()
        pose = pose_generator(data)
        pose = np.concatenate([
            pose,
            np.stack([np.stack([np.eye(3)] * pose.shape[0], 0)] * 2, 1)
        ], 1)
        shape = [768, 768]
        render = SMPLRender(cfg.RENDER.SMPL_MODEL_PATH)

        if not os.environ.get("PYOPENGL_PLATFORM"):
            os.environ["DISPLAY"] = ":0.0"
            os.environ["PYOPENGL_PLATFORM"] = "egl"

        size = (shape[1], shape[0])
        fps = 20.0
        fourcc = cv2.VideoWriter_fourcc('M', 'P', '4', 'V')
        videoWriter = cv2.VideoWriter(output_mp4_path, fourcc, fps, size)
        r = RRR.from_rotvec(np.array([np.pi, 0.0, 0.0]))
        pose[:, 0] = np.matmul(r.as_matrix().reshape(1, 3, 3), pose[:, 0])
        for i in range(data.shape[0]):
            img = np.zeros([shape[0], shape[1], 3])
            aroot = data[[i], 0] + np.array([[0.0, 0.0, 30.0]])
            aroot[:, 1] = -aroot[:, 1]
            params = dict(pred_shape=np.zeros([1, 10]),
                          pred_root=aroot,
                          pred_pose=pose[[i]])
            renderImg = render.render(img.copy(), params)
            renderImg = (renderImg * 255).astype(np.uint8)
            videoWriter.write(renderImg)
        videoWriter.release()
        output_video_h264_name = output_mp4_path[:-4] + '_h264.mp4'
        command = 'ffmpeg -y -i {} -vcodec h264 {}'.format(
            output_mp4_path, output_video_h264_name)
        os.system(command)
        output_mp4_path = output_video_h264_name
        video_fname = video_fname[:-4] + '_h264.mp4'
    elif method == 'fast':
        output_gif_path = output_mp4_path[:-4] + '.gif'
        if len(data.shape) == 3:
            data = data[None]
        if isinstance(data, torch.Tensor):
            data = data.cpu().numpy()
        pose_vis = plot_3d.draw_to_batch(data, [''], [output_gif_path])
        out_video = mp.VideoFileClip(output_gif_path)
        out_video.write_videofile(output_mp4_path)

    return output_mp4_path, video_fname, output_npy_path, feats_fname


def load_motion(motion_uploaded, method):
    file = motion_uploaded['file']

    feats = torch.tensor(np.load(file), device=model.device)
    if len(feats.shape) == 2:
        feats = feats[None]
    # feats = model.datamodule.normalize(feats)

    # Motion tokens
    motion_lengths = feats.shape[0]
    motion_token, _ = model.vae.encode(feats)

    motion_token_string = model.lm.motion_token_to_string(
        motion_token, [motion_token.shape[1]])[0]
    motion_token_length = motion_token.shape[1]

    # Motion rendered
    joints = model.datamodule.feats2joints(feats.cpu()).cpu().numpy()
    output_mp4_path, video_fname, output_npy_path, joints_fname = render_motion(
        joints,
        feats.to('cpu').numpy(), method)

    motion_uploaded.update({
        "feats": feats,
        "joints": joints,
        "motion_video": output_mp4_path,
        "motion_video_fname": video_fname,
        "motion_joints": output_npy_path,
        "motion_joints_fname": joints_fname,
        "motion_lengths": motion_lengths,
        "motion_token": motion_token,
        "motion_token_string": motion_token_string,
        "motion_token_length": motion_token_length,
    })

    return motion_uploaded


def add_text(history, text, motion_uploaded, data_stored, method):
    data_stored = data_stored + [{'user_input': text}]

    text = f"""<h3>{text}</h3>"""
    history = history + [(text, None)]
    if 'file' in motion_uploaded.keys():
        motion_uploaded = load_motion(motion_uploaded, method)
        output_mp4_path = motion_uploaded['motion_video']
        video_fname = motion_uploaded['motion_video_fname']
        output_npy_path = motion_uploaded['motion_joints']
        joints_fname = motion_uploaded['motion_joints_fname']
        history = history + [(Video_Components.format(
            video_path=output_mp4_path,
            video_fname=video_fname,
            motion_path=output_npy_path,
            motion_fname=joints_fname), None)]

    return history, gr.update(value="",
                              interactive=False), motion_uploaded, data_stored


def add_audio(history, audio_path, data_stored, language='en'):
    audio, sampling_rate = librosa.load(audio_path, sr=16000)
    input_features = audio_processor(
        audio, sampling_rate, return_tensors="pt"
    ).input_features  # whisper training sampling rate, do not modify
    input_features = torch.Tensor(input_features).to(device)

    if language == 'English':
        forced_decoder_ids = forced_decoder_ids_en
    else:
        forced_decoder_ids = forced_decoder_ids_zh
    predicted_ids = audio_model.generate(input_features,
                                         forced_decoder_ids=forced_decoder_ids)
    text_input = audio_processor.batch_decode(predicted_ids,
                                              skip_special_tokens=True)
    text_input = str(text_input).strip('[]"')
    data_stored = data_stored + [{'user_input': text_input}]
    gr.update(value=data_stored, interactive=False)
    history = history + [(text_input, None)]

    return history, data_stored


def add_file(history, file, txt, motion_uploaded):
    motion_uploaded['file'] = file.name
    txt = txt.replace(" <Motion_Placeholder>", "") + " <Motion_Placeholder>"
    return history, gr.update(value=txt, interactive=True), motion_uploaded


def bot(history, motion_uploaded, data_stored, method):

    motion_length, motion_token_string = motion_uploaded[
        "motion_lengths"], motion_uploaded["motion_token_string"]

    input = data_stored[-1]['user_input']
    prompt = model.lm.placeholder_fulfill(input, motion_length,
                                          motion_token_string, "")
    data_stored[-1]['model_input'] = prompt
    batch = {
        "length": [motion_length],
        "text": [prompt],
    }

    outputs = model(batch, task="t2m")
    out_feats = outputs["feats"][0]
    out_lengths = outputs["length"][0]
    out_joints = outputs["joints"][:out_lengths].detach().cpu().numpy()
    out_texts = outputs["texts"][0]
    output_mp4_path, video_fname, output_npy_path, joints_fname = render_motion(
        out_joints,
        out_feats.to('cpu').numpy(), method)

    motion_uploaded = {
        "feats": None,
        "joints": None,
        "motion_video": None,
        "motion_lengths": 0,
        "motion_token": None,
        "motion_token_string": '',
        "motion_token_length": 0,
    }

    data_stored[-1]['model_output'] = {
        "feats": out_feats,
        "joints": out_joints,
        "length": out_lengths,
        "texts": out_texts,
        "motion_video": output_mp4_path,
        "motion_video_fname": video_fname,
        "motion_joints": output_npy_path,
        "motion_joints_fname": joints_fname,
    }

    if '<Motion_Placeholder>' == out_texts:
        response = [
            Video_Components.format(video_path=output_mp4_path,
                                    video_fname=video_fname,
                                    motion_path=output_npy_path,
                                    motion_fname=joints_fname)
        ]
    elif '<Motion_Placeholder>' in out_texts:
        response = [
            Text_Components.format(
                msg=out_texts.split("<Motion_Placeholder>")[0]),
            Video_Components.format(video_path=output_mp4_path,
                                    video_fname=video_fname,
                                    motion_path=output_npy_path,
                                    motion_fname=joints_fname),
            Text_Components.format(
                msg=out_texts.split("<Motion_Placeholder>")[1]),
        ]
    else:
        response = f"""<h3>{out_texts}</h3>"""

    history[-1][1] = ""
    for character in response:
        history[-1][1] += character
        time.sleep(0.02)
        yield history, motion_uploaded, data_stored


def bot_example(history, responses):
    for response in responses:
        history[-1][1] = ""
        for character in response:
            history[-1][1] += character
            time.sleep(0.02)
            yield history, motion_uploaded, data_stored


# Examples
chat_instruct = [
    (None,
     "**👋 Hi, I'm MotionGPT! I can generate realistic human motion from text, or generate text from motion.**"
     ),
    (None,
     "You can chat with me in pure text, for example by asking me to generate human motion following your descriptions."
     ),
    (None,
     "After generation, you can click the buttons in the top right of the generated motion result to download the motion video or the feature stored in .npy format."
     ),
    (None,
     "With a human motion feature file downloaded here or taken from the dataset, you can ask me to translate it!"
     ),
    (None,
     "Of course, you can also simply chat with me and let me describe human motion in text; here are some examples!"
     ),
    (None,
     "We provide two motion visualization methods. The default fast method is skeleton line plotting, as in the examples below:"
     ),
    (None,
     Video_Components_example.format(video_path="assets/videos/t2m_0.mp4",
                                     video_fname="example1.mp4")),
    (None,
     "The slow method is SMPL model rendering, which is more realistic but slower."
     ),
    (None,
     Video_Components_example.format(video_path="assets/videos/t2m_0.mp4",
                                     video_fname="example1.mp4")),
    (None, "👉 Follow the examples and try it yourself!"),
]

t2m_examples = [
    (None,
     "You can chat with me in pure text; the following are some examples of text-to-motion generation!"
     ),
    ("Generate a person is walking forwards, but stumbles and steps back, then carries on forward.",
     Video_Components_example.format(video_path="assets/videos/t2m_0.mp4",
                                     video_fname="example1.mp4")),
    ("Generate a person is walking forwards, but stumbles and steps back, then carries on forward.",
     Video_Components_example.format(video_path="assets/videos/t2m_0.mp4",
                                     video_fname="example1.mp4")),
    ("Generate a person is walking forwards, but stumbles and steps back, then carries on forward.",
     Video_Components_example.format(video_path="assets/videos/t2m_0.mp4",
                                     video_fname="example1.mp4")),
]

m2t_examples = [
    (None,
     "With a human motion feature file downloaded here or taken from the dataset, you can ask me to translate it; here are some examples!"
     ),
    ("Please explain the movement shown in [Motion_tokens] using natural language.",
     None),
    (Video_Components_example.format(video_path="assets/videos/m2t_0.mp4",
                                     video_fname="example2.mp4"),
     "a person walks forward then does a backwards z-shape movement to its left side. then back to the right."
     ),
    ("Please explain the movement shown in [Motion_tokens] using natural language.",
     None),
    (Video_Components_example.format(video_path="assets/videos/m2t_0.mp4",
                                     video_fname="example2.mp4"),
     "a person walks forward then does a backwards z-shape movement to its left side. then back to the right."
     ),
]

t2t_examples = [
    (None,
     "Of course, you can also simply chat with me and let me describe human motion in text; here are some examples!"
     ),
    ('Depict a motion as like you have seen it.',
     "The person walks while swaying their hips along a curved path to the left slowly then stops to look down at the edge of the grey platform at something."
     ),
    ('Depict a motion as like you have seen it.',
     "The person walks while swaying their hips along a curved path to the left slowly then stops to look down at the edge of the grey platform at something."
     ),
]

Init_chatbot = [
    (None,
     "**👋 Hi, I'm MotionGPT! I can generate realistic human motion from text, or generate text from motion.**"
     )
] + t2m_examples[:3] + m2t_examples[:2] + t2t_examples[:2] + chat_instruct[-4:]

with open("assets/css/custom.css", "r", encoding="utf-8") as f:
    customCSS = f.read()

with gr.Blocks(css=customCSS) as demo:

    # Variables
    motion_uploaded = gr.State({
        "feats": None,
        "joints": None,
        "motion_video": None,
        "motion_lengths": 0,
        "motion_token": None,
        "motion_token_string": '',
        "motion_token_length": 0,
    })
    data_stored = gr.State([])

    gr.Markdown("# MotionGPT")

    chatbot = gr.Chatbot(Init_chatbot,
                         elem_id="mGPT",
                         height=600,
                         label="MotionGPT",
                         avatar_images=(None,
                                        ("assets/images/avatar_bot.jpg")),
                         bubble_full_width=False)

    with gr.Row():
        with gr.Column(scale=0.85):
            with gr.Row():
                txt = gr.Textbox(
                    label="Text",
                    show_label=False,
                    placeholder=
                    "Enter text and press ENTER or speak to input. You can also upload motion.",
                    container=False)

            with gr.Row():
                aud = gr.Audio(source="microphone",
                               label="Speak input",
                               type='filepath')
                btn = gr.UploadButton("📁 Upload motion",
                                      elem_id="upload",
                                      file_types=["file"],
                                      variant='primary')
                regen = gr.Button("🔄 Regenerate", elem_id="regen")
                clear = gr.ClearButton([txt, chatbot, aud], value='🗑️ Clear')

            with gr.Row():
                gr.Markdown('''
                ### You can get more examples (pre-generated for faster response) by clicking the buttons below:
                ''')

            with gr.Row():
                instruct = gr.Button("Instructions", elem_id="instruction")
                t2m_eg = gr.Button("Text-to-Motion", elem_id="t2m")
                m2t_eg = gr.Button("Motion-to-Text", elem_id="m2t")
                t2t_eg = gr.Button("Random description", elem_id="t2t")

        with gr.Column(scale=0.15, min_width=150):
            method = gr.Dropdown(["slow", "fast"],
                                 label="Visualization method",
                                 interactive=True,
                                 elem_id="method",
                                 value="fast")

            language = gr.Dropdown(["English", "中文"],
                                   label="Speech language",
                                   interactive=True,
                                   elem_id="language",
                                   value="English")

    txt_msg = txt.submit(
        add_text, [chatbot, txt, motion_uploaded, data_stored, method],
        [chatbot, txt, motion_uploaded, data_stored],
        queue=False).then(bot, [chatbot, motion_uploaded, data_stored, method],
                          [chatbot, motion_uploaded, data_stored])

    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)

    file_msg = btn.upload(add_file, [chatbot, btn, txt, motion_uploaded],
                          [chatbot, txt, motion_uploaded],
                          queue=False)
    aud_msg = aud.stop_recording(
        add_audio, [chatbot, aud, data_stored, language],
        [chatbot, data_stored],
        queue=False).then(bot, [chatbot, motion_uploaded, data_stored, method],
                          [chatbot, motion_uploaded, data_stored])
    regen_msg = regen.click(bot,
                            [chatbot, motion_uploaded, data_stored, method],
                            [chatbot, motion_uploaded, data_stored],
                            queue=False)
    chatbot.change(scroll_to_output=True)

demo.queue()

if __name__ == "__main__":
    demo.launch(debug=True)
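Stripped of the Gradio plumbing, the text-to-motion path in bot() reduces to: fill the prompt placeholders, call the model with task="t2m", then render the result. A minimal sketch that reuses the objects defined at the top of app.py (cfg, model, device, render_motion); the wrapper function name is my own and the call pattern mirrors bot() when no motion has been uploaded:

# Sketch: the core text-to-motion call used by bot(), without the chat UI.
# Assumes the module-level globals from app.py (model, render_motion) already exist.
def generate_motion(text: str, method: str = "fast"):
    prompt = model.lm.placeholder_fulfill(text, 0, "", "")  # no uploaded motion tokens
    batch = {"length": [0], "text": [prompt]}
    outputs = model(batch, task="t2m")                      # same call as in bot()
    joints = outputs["joints"][:outputs["length"][0]].detach().cpu().numpy()
    feats = outputs["feats"][0].to("cpu").numpy()
    return render_motion(joints, feats, method)             # (mp4 path, mp4 name, npy path, npy name)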
assets/css/custom.css
ADDED
@@ -0,0 +1,359 @@
/* Borrowed from https://huggingface.co/spaces/project-baize/chat-with-baize */

:root {
  --chatbot-color-light: #f6f6f6;
  --chatbot-color-dark: #121111;
}

/* Light mode (default) */
#mGPT {
  background-color: var(--chatbot-color-light) !important;
  color: #000000 !important;
}
[data-testid='bot'] {
  background-color: #ffffff !important;
}
[data-testid='user'] {
  background-color: #95ec69 !important;
}

/* Dark mode */
.dark #mGPT {
  background-color: var(--chatbot-color-dark) !important;
  color: #ffffff !important;
}
.dark [data-testid='bot'] {
  background-color: #2c2c2c !important;
}
.dark [data-testid='user'] {
  background-color: #26b561 !important;
}

#mGPT {
  height: 100%;
  min-height: 500px;
}

[class*='message-buttons'] {
  visibility: hidden;
}

[class*='message'] {
  border: none;
  font-size: var(--text-xl) !important;
  line-height: var(--line-xl) !important;
}
/* [data-testid='bot'] {
  max-width: 85%;
  width: auto !important;
  border-bottom-left-radius: 0 !important;
}
[data-testid='user'] {
  max-width: 85%;
  width: auto !important;
  border-bottom-right-radius: 0 !important;
} */

/* Text & Video */
#method {
  line-height: 1.95 !important;
}

.side-content {
  max-width: 340px;
}

/* @media only screen and (min-width: 768px) {
  .side-content {
    float: left;
    overflow-wrap: break-word;
    padding-right: 2rem;
  }

  .side-video {
    float: right;
  }
} */

/* Button */
#upload {
  color: #000000;
}

.videodl-button {
  position: absolute;
  left: 80%;
  top: 5px;
  width: 24px;
  height: 24px;
}
.videodl-button svg {
  width: 24px;
  height: 24px;
}
.npydl-button {
  position: absolute;
  left: 90%;
  top: 5px;
  width: 24px;
  height: 24px;
}
.npydl-button svg {
  width: 24px;
  height: 24px;
}

/* Table */
table {
  margin: 1em 0;
  border-collapse: collapse;
  empty-cells: show;
}
td,
th {
  border: 1.2px solid var(--border-color-primary) !important;
  padding: 0.2em;
}
thead {
  background-color: rgba(175, 184, 193, 0.2);
}
thead th {
  padding: 0.5em 0.2em;
}
/* Inline code */
#mGPT code {
  display: inline;
  white-space: break-spaces;
  border-radius: 6px;
  margin: 0 2px 0 2px;
  padding: 0.2em 0.4em 0.1em 0.4em;
  background-color: rgba(175, 184, 193, 0.2);
}
/* Code block */
#mGPT pre code {
  display: block;
  overflow: auto;
  white-space: pre;
  background-color: hsla(0, 0%, 0%, 80%) !important;
  border-radius: 10px;
  padding: 1.4em 1.2em 0em 1.4em;
  margin: 1.2em 2em 1.2em 0.5em;
  color: #fff;
  box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
}
/* Highlight */
#mGPT .highlight { background-color: transparent; }
#mGPT .highlight .hll { background-color: #49483e; }
#mGPT .highlight .c { color: #75715e; } /* Comment */
#mGPT .highlight .err { color: #960050; background-color: #1e0010; } /* Error */
#mGPT .highlight .k { color: #66d9ef; } /* Keyword */
#mGPT .highlight .l { color: #ae81ff; } /* Literal */
#mGPT .highlight .n { color: #f8f8f2; } /* Name */
#mGPT .highlight .o { color: #f92672; } /* Operator */
#mGPT .highlight .p { color: #f8f8f2; } /* Punctuation */
#mGPT .highlight .ch { color: #75715e; } /* Comment.Hashbang */
#mGPT .highlight .cm { color: #75715e; } /* Comment.Multiline */
#mGPT .highlight .cp { color: #75715e; } /* Comment.Preproc */
#mGPT .highlight .cpf { color: #75715e; } /* Comment.PreprocFile */
#mGPT .highlight .c1 { color: #75715e; } /* Comment.Single */
#mGPT .highlight .cs { color: #75715e; } /* Comment.Special */
#mGPT .highlight .gd { color: #f92672; } /* Generic.Deleted */
#mGPT .highlight .ge { font-style: italic; } /* Generic.Emph */
#mGPT .highlight .gi { color: #a6e22e; } /* Generic.Inserted */
#mGPT .highlight .gs { font-weight: bold; } /* Generic.Strong */
#mGPT .highlight .gu { color: #75715e; } /* Generic.Subheading */
#mGPT .highlight .kc { color: #66d9ef; } /* Keyword.Constant */
#mGPT .highlight .kd { color: #66d9ef; } /* Keyword.Declaration */
#mGPT .highlight .kn { color: #f92672; } /* Keyword.Namespace */
#mGPT .highlight .kp { color: #66d9ef; } /* Keyword.Pseudo */
#mGPT .highlight .kr { color: #66d9ef; } /* Keyword.Reserved */
#mGPT .highlight .kt { color: #66d9ef; } /* Keyword.Type */
#mGPT .highlight .ld { color: #e6db74; } /* Literal.Date */
#mGPT .highlight .m { color: #ae81ff; } /* Literal.Number */
#mGPT .highlight .s { color: #e6db74; } /* Literal.String */
#mGPT .highlight .na { color: #a6e22e; } /* Name.Attribute */
#mGPT .highlight .nb { color: #f8f8f2; } /* Name.Builtin */
#mGPT .highlight .nc { color: #a6e22e; } /* Name.Class */
#mGPT .highlight .no { color: #66d9ef; } /* Name.Constant */
#mGPT .highlight .nd { color: #a6e22e; } /* Name.Decorator */
#mGPT .highlight .ni { color: #f8f8f2; } /* Name.Entity */
#mGPT .highlight .ne { color: #a6e22e; } /* Name.Exception */
#mGPT .highlight .nf { color: #a6e22e; } /* Name.Function */
#mGPT .highlight .nl { color: #f8f8f2; } /* Name.Label */
#mGPT .highlight .nn { color: #f8f8f2; } /* Name.Namespace */
#mGPT .highlight .nx { color: #a6e22e; } /* Name.Other */
#mGPT .highlight .py { color: #f8f8f2; } /* Name.Property */
#mGPT .highlight .nt { color: #f92672; } /* Name.Tag */
#mGPT .highlight .nv { color: #f8f8f2; } /* Name.Variable */
#mGPT .highlight .ow { color: #f92672; } /* Operator.Word */
#mGPT .highlight .w { color: #f8f8f2; } /* Text.Whitespace */
#mGPT .highlight .mb { color: #ae81ff; } /* Literal.Number.Bin */
#mGPT .highlight .mf { color: #ae81ff; } /* Literal.Number.Float */
#mGPT .highlight .mh { color: #ae81ff; } /* Literal.Number.Hex */
#mGPT .highlight .mi { color: #ae81ff; } /* Literal.Number.Integer */
#mGPT .highlight .mo { color: #ae81ff; } /* Literal.Number.Oct */
#mGPT .highlight .sa { color: #e6db74; } /* Literal.String.Affix */
#mGPT .highlight .sb { color: #e6db74; } /* Literal.String.Backtick */
#mGPT .highlight .sc { color: #e6db74; } /* Literal.String.Char */
#mGPT .highlight .dl { color: #e6db74; } /* Literal.String.Delimiter */
#mGPT .highlight .sd { color: #e6db74; } /* Literal.String.Doc */
#mGPT .highlight .s2 { color: #e6db74; } /* Literal.String.Double */
#mGPT .highlight .se { color: #ae81ff; } /* Literal.String.Escape */
#mGPT .highlight .sh { color: #e6db74; } /* Literal.String.Heredoc */
#mGPT .highlight .si { color: #e6db74; } /* Literal.String.Interpol */
#mGPT .highlight .sx { color: #e6db74; } /* Literal.String.Other */
#mGPT .highlight .sr { color: #e6db74; } /* Literal.String.Regex */
#mGPT .highlight .s1 { color: #e6db74; } /* Literal.String.Single */
#mGPT .highlight .ss { color: #e6db74; } /* Literal.String.Symbol */
#mGPT .highlight .bp { color: #f8f8f2; } /* Name.Builtin.Pseudo */
#mGPT .highlight .fm { color: #a6e22e; } /* Name.Function.Magic */
#mGPT .highlight .vc { color: #f8f8f2; } /* Name.Variable.Class */
#mGPT .highlight .vg { color: #f8f8f2; } /* Name.Variable.Global */
#mGPT .highlight .vi { color: #f8f8f2; } /* Name.Variable.Instance */
#mGPT .highlight .vm { color: #f8f8f2; } /* Name.Variable.Magic */
#mGPT .highlight .il { color: #ae81ff; } /* Literal.Number.Integer.Long */
assets/images/avatar_bot.jpg
ADDED
assets/meta/mean.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0bdb5ba69a3a9e34d71990db15bc535ebc024c8d95ddb5574196f96058faa7d3
size 2232

assets/meta/mean_eval.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0bdb5ba69a3a9e34d71990db15bc535ebc024c8d95ddb5574196f96058faa7d3
size 2232

assets/meta/std.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6a5f7d60301c9465972fc225f8ad0ee8f957e7720431189123eb6d15873a9557
size 2232

assets/meta/std_eval.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6a5f7d60301c9465972fc225f8ad0ee8f957e7720431189123eb6d15873a9557
size 2232

assets/videos/m2t_0.mp4
ADDED
Binary file (500 kB)

assets/videos/t2m_0.mp4
ADDED
Binary file (811 kB)
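assets/meta ships mean.npy and std.npy (plus _eval variants) as Git LFS pointers. Judging by the commented-out model.datamodule.normalize(feats) call in app.py, these look like the per-dimension statistics used to z-score the 263-dim HumanML3D features; the exact convention lives in the datamodule, so treat the sketch below as an assumption rather than the repository's implementation:

# Sketch: z-score (de)normalization with the bundled statistics (assumed convention).
import numpy as np

mean = np.load("assets/meta/mean.npy")  # assumed shape (263,), matching DATASET.NFEATS
std = np.load("assets/meta/std.npy")

def normalize(feats):
    return (feats - mean) / std

def denormalize(feats):
    return feats * std + mean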
configs/assets.yaml
ADDED
@@ -0,0 +1,32 @@
CONFIG_FOLDER: configs # Config files path
FOLDER: experiments # Experiment files saving path

TEST:
  FOLDER: results # Testing files saving path

DATASET:
  TASK_ROOT: deps/mGPT_instructions
  SMPL_PATH: deps/smpl
  TRANSFORM_PATH: deps/transforms/
  WORD_VERTILIZER_PATH: deps/glove/
  KIT:
    ROOT: datasets/kit-ml # KIT directory
    SPLIT_ROOT: datasets/kit-ml # KIT splits directory
    MEAN_STD_PATH: deps/t2m/
  HUMANML3D:
    ROOT: datasets/humanml3d # HumanML3D directory
    SPLIT_ROOT: datasets/humanml3d # HumanML3D splits directory
    MEAN_STD_PATH: deps/t2m/

METRIC:
  TM2T:
    t2m_path: deps/t2m/ # path for tm2t evaluator

model:
  whisper_path: openai/whisper-large-v2 # path for whisper model, webui only

RENDER:
  BLENDER_PATH: libs/blender-2.93.2-linux-x64/blender
  SMPL_MODEL_PATH: deps/smpl/smpl_models/smpl
  MODEL_PATH: deps/smpl/smpl_models/
  FACES_PATH: deps/smplh/smplh.faces
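app.py reads these keys through attribute access (cfg.FOLDER, cfg.model.whisper_path, cfg.RENDER.SMPL_MODEL_PATH), which is consistent with an OmegaConf-style object returned by parse_args. A small sketch of that access pattern; loading a single YAML file here is only a stand-in for mGPT.config.parse_args, which presumably merges several config files:

# Sketch: how the keys in assets.yaml are consumed (OmegaConf assumed as the config backend).
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/assets.yaml")  # stand-in for parse_args(phase="webui")
print(cfg.RENDER.SMPL_MODEL_PATH)            # deps/smpl/smpl_models/smpl
print(cfg.model.whisper_path)                # openai/whisper-large-v2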
configs/default.yaml
ADDED
@@ -0,0 +1,141 @@
SEED_VALUE: 1234 # Seed value
DEBUG: True # Debug mode
FULL_CONFIG: false

TRAIN:
  SPLIT: 'train' # Training split name
  NUM_WORKERS: 8 # Number of workers
  BATCH_SIZE: 8 # Size of batches
  END_EPOCH: 2000 # End epoch

  RESUME: '' # Experiment path to be resumed training
  PRETRAINED_VAE: '' # Pretrained vae/vqvae model path
  PRETRAINED: '' # Pretrained model path

  OPTIM:
    target: AdamW
    params:
      lr: 2e-4
      betas: [0.9, 0.99]
      weight_decay: 0.0

  LR_SCHEDULER:
    target: CosineAnnealingLR
    params:
      T_max: ${eval:${LOGGER.VAL_EVERY_STEPS} * 100}
      eta_min: 1e-6

EVAL:
  SPLIT: 'val' # Validation split name
  BATCH_SIZE: 16 # Validation batch size
  NUM_WORKERS: 8 # Validation number of workers

TEST:
  CHECKPOINTS: '' # Pretrained model path
  SPLIT: 'test' # Testing split name
  BATCH_SIZE: 16 # Testing batch size
  NUM_WORKERS: 8 # Testing number of workers

  SAVE_PREDICTIONS: False # Whether to save predictions
  COUNT_TIME: False # Whether to count time during test
  REPLICATION_TIMES: 20 # Number of times to replicate the test
  REP_I: 0 # For counting replication times

model:
  target: mGPT.models.mgpt.MotionGPT
  params:
    condition: 'text'
    task: 't2m'
    lm: ${lm.default}
    motion_vae: ${vq.default}

    # Related parameters
    stage: ${TRAIN.STAGE}
    debug: ${DEBUG}
    codebook_size: ${model.params.motion_vae.params.code_num}
    metrics_dict: ${METRIC.TYPE}

LOSS:
  LAMBDA_REC: 1.0 # Lambda for reconstruction losses
  LAMBDA_JOINT: 1.0 # Lambda for joint losses

  LAMBDA_LATENT: 1e-5 # Lambda for latent losses
  LAMBDA_KL: 1e-5 # Lambda for kl losses
  LAMBDA_GEN: 1.0 # Lambda for text-motion generation losses
  LAMBDA_CROSS: 1.0 # Lambda for cross-reconstruction losses
  LAMBDA_CYCLE: 1.0 # Lambda for cycle losses
  LAMBDA_PRIOR: 0.0 # Lambda for diffusion prior losses

  LAMBDA_VELOCITY: 0.5 # Lambda for velocity losses
  LAMBDA_COMMIT: 0.02 # Lambda for commitment losses

  ABLATION:
    RECONS_LOSS: 'l1_smooth'

METRIC:
  TASK: 't2m'
  FORCE_IN_METER: True
  DIST_SYNC_ON_STEP: True
  MM_NUM_SAMPLES: 100 # Number of samples for multimodal test
  MM_NUM_REPEATS: 30 # Number of repeats for multimodal test
  MM_NUM_TIMES: 10 # Number of times to repeat the multimodal test
  DIVERSITY_TIMES: 300 # Number of times to repeat the diversity test
  TM2T: ${evaluator.tm2t}

DATASET:
  target: mGPT.data.HumanML3D.HumanML3DDataModule
  CODE_PATH: 'VQVAE'
  TASK_ROOT: ''
  TASK_PATH: ''
  NFEATS: 263
  KIT:
    MAX_MOTION_LEN: 196
    MIN_MOTION_LEN: 24
    MAX_TEXT_LEN: 20
    PICK_ONE_TEXT: true
    FRAME_RATE: 12.5
    UNIT_LEN: 4
  HUMANML3D:
    MAX_MOTION_LEN: 196
    MIN_MOTION_LEN: 40
    MAX_TEXT_LEN: 20
    PICK_ONE_TEXT: true
    FRAME_RATE: 20.0
    UNIT_LEN: 4
    STD_TEXT: False

ABLATION:
  # For MotionGPT
  use_length: False
  predict_ratio: 0.2
  inbetween_ratio: 0.25
  image_size: 256

  # For Motion-latent-diffusion
  VAE_TYPE: 'actor' # vae ablation: actor or mcross
  VAE_ARCH: 'encoder_decoder' # mdiffusion vae architecture
  PE_TYPE: 'actor' # mdiffusion mld or actor
  DIFF_PE_TYPE: 'actor' # mdiffusion mld or actor
  SKIP_CONNECT: False # skip connection for denoiser va
  MLP_DIST: False # use linear to expand mean and std rather than expand token nums
  IS_DIST: False # Mcross distribution kl
  PREDICT_EPSILON: True # noise or motion

LOGGER:
  VAL_EVERY_STEPS: 10
  LOGGERS: ['tensorboard', 'wandb']
  TENSORBOARD:
    target: pytorch_lightning.loggers.TensorBoardLogger
    params:
      save_dir: ${FOLDER_EXP}
      name: 'tensorboard'
      version: ''
  WANDB:
    target: pytorch_lightning.loggers.WandbLogger
    params:
      project: null
      offline: False
      id: null
      version: ''
      name: ${NAME}
      save_dir: ${FOLDER_EXP}
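default.yaml relies on interpolation (${lm.default}, ${FOLDER_EXP}) plus a custom ${eval:...} resolver for arithmetic such as T_max. OmegaConf only ships interpolation out of the box; the arithmetic resolver has to be registered somewhere, presumably inside mGPT.config. A hedged sketch of how such a resolver can be registered, shown only to illustrate the mechanism:

# Sketch: registering an "eval" resolver so ${eval:${LOGGER.VAL_EVERY_STEPS} * 100} resolves.
# The real registration is assumed to live in mGPT.config; this is not its actual code.
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("eval", lambda expr: eval(expr))  # trusts config authors

cfg = OmegaConf.create({
    "LOGGER": {"VAL_EVERY_STEPS": 10},
    "T_max": "${eval:${LOGGER.VAL_EVERY_STEPS} * 100}",
})
print(cfg.T_max)  # 1000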
configs/evaluator/tm2t.yaml
ADDED
@@ -0,0 +1,19 @@
t2m_textencoder:
  target: mGPT.archs.tm2t_evaluator.TextEncoderBiGRUCo
  params:
    word_size: 300
    pos_size: 15
    hidden_size: 512
    output_size: 512
t2m_moveencoder:
  target: mGPT.archs.tm2t_evaluator.MovementConvEncoder
  params:
    input_size: ${eval:${DATASET.NFEATS} - 4}
    hidden_size: 512
    output_size: 512
t2m_motionencoder:
  target: mGPT.archs.tm2t_evaluator.MotionEncoderBiGRUCo
  params:
    input_size: ${evaluator.tm2t.t2m_moveencoder.params.output_size}
    hidden_size: 1024
    output_size: 512
configs/lm/default.yaml
ADDED
@@ -0,0 +1,7 @@
target: mGPT.archs.mgpt_lm.MLM
params:
  model_type: t5
  model_path: google/flan-t5-base
  stage: ${TRAIN.STAGE}
  motion_codebook_size: ${model.params.codebook_size}
  ablation: ${ABLATION}
configs/render.yaml
ADDED
@@ -0,0 +1,23 @@
NAME: '___render_do_not_need_name__' # Experiment name
ACCELERATOR: 'gpu' # Device options: "cpu", "gpu", "tpu", "ipu", "hpu", "mps", "auto"
DEVICE: [0] # Index of gpus, e.g. [0] or [0,1,2,3]

RENDER:
  FOLDER: '___no_need__'
  INPUT_MODE: 'npy'
  DIR: ''
  NPY: '___no_need__'
  DENOISING: True
  OLDRENDER: True
  # ["ultra", "high", "med", "low"]
  # RES: 'high'
  RES: 'med'
  DOWNSAMPLE: False
  FPS: 20.0
  CANONICALIZE: True
  EXACT_FRAME: 0.5
  NUM: 8
  MODE: '___no_need__' # sequence, frame, or video
  VID_EXT: mp4
  ALWAYS_ON_FLOOR: false
  GT: false
configs/vq/default.yaml
ADDED
@@ -0,0 +1,15 @@
target: mGPT.archs.mgpt_vq.VQVae
params:
  quantizer: 'ema_reset'
  code_num: 512
  code_dim: 512
  output_emb_width: 512
  down_t: 2
  stride_t: 2
  width: 512
  depth: 3
  dilation_growth_rate: 3
  norm: None
  activation: 'relu'
  nfeats: ${DATASET.NFEATS}
  ablation: ${ABLATION}
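The lm and vq configs share the target/params layout: target names a class by dotted path and params holds its constructor keyword arguments. A generic instantiation helper in that style; the function name is hypothetical and the repository presumably has its own equivalent inside its build_model/build_data utilities:

# Sketch: generic "target/params" instantiation (instantiate_from_config is a hypothetical helper).
import importlib
from omegaconf import OmegaConf

def instantiate_from_config(node, **extra):
    module_path, cls_name = node.target.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    params = OmegaConf.to_container(node.params, resolve=True) if "params" in node else {}
    return cls(**params, **extra)

# usage (after the full config tree is merged so interpolations resolve):
#   vae = instantiate_from_config(cfg.model.params.motion_vae)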
configs/webui.yaml
ADDED
@@ -0,0 +1,74 @@
NAME: Webui # Experiment name
DEBUG: False # Debug mode
ACCELERATOR: 'cpu' # Device options: "cpu", "gpu", "tpu", "ipu", "hpu", "mps", "auto"
DEVICE: [0] # Index of gpus, e.g. [0] or [0,1,2,3]

# Training configuration
TRAIN:
  #---------------------------------
  STAGE: lm_instruct
  DATASETS: ['humanml3d'] # Training datasets
  NUM_WORKERS: 32 # Number of workers
  BATCH_SIZE: 16 # Size of batches
  START_EPOCH: 0 # Start epoch
  END_EPOCH: 99999 # End epoch
  ABLATION:
    pkeep: 0.5
  OPTIM:
    TYPE: AdamW # Optimizer type
    LR: 2e-4 # Learning rate
    WEIGHT_DECAY: 0.0
    LR_SCHEDULER: [100, 200, 300, 400]
    GAMMA: 0.8

# Evaluating configuration
EVAL:
  DATASETS: ['humanml3d'] # Evaluating datasets
  BATCH_SIZE: 32 # Evaluating batch size
  SPLIT: test

# Test configuration
TEST:
  CHECKPOINTS: checkpoints/MotionGPT-base/motiongpt_s3_h3d.ckpt
  DATASETS: ['humanml3d'] # Testing datasets
  SPLIT: test
  BATCH_SIZE: 32 # Testing batch size
  MEAN: False
  NUM_SAMPLES: 1
  FACT: 1

# Datasets configuration
DATASET:
  JOINT_TYPE: 'humanml3d' # joint type
  CODE_PATH: 'VQBEST'
METRIC:
  TYPE: ['TM2TMetrics']
# Losses configuration
LOSS:
  TYPE: t2mgpt # Losses type
  LAMBDA_FEATURE: 1.0
  LAMBDA_VELOCITY: 0.5
  LAMBDA_COMMIT: 0.02
  LAMBDA_CLS: 1.0
  LAMBDA_M2T2M: 1.0
  LAMBDA_T2M2T: 10.0
  ABLATION:
    RECONS_LOSS: 'l1_smooth'

# Model configuration
model:
  target: mGPT.models.mgpt.MotionGPT
  params:
    condition: 'text'
    task: 't2m'
    lm: ${lm.default}
    motion_vae: ${vq.default}

# Logger configuration
LOGGER:
  LOG_EVERY_STEPS: 5
  VAL_EVERY_STEPS: 10
  TENSORBOARD: True
  wandb:
    params:
      project: null
deps/smpl/smpl_models/SMPL_downsample_index.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e5b783c1677079397ee4bc26df5c72d73b8bb393bea41fa295b951187443daec
size 3556
deps/smpl/smpl_models/gmm_08.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e1374908aae055a2afa01a2cd9a169bc6cfec1ceb7aa590e201a47b383060491
size 839127
deps/smpl/smpl_models/neutral_smpl_mean_params.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ac9b474c74daec0253ed084720f662059336e976850f08a4a9a3f76d06613776
size 4848
deps/smpl/smpl_models/smpl.faces
ADDED
Binary file (331 kB)
deps/smpl/smpl_models/smpl.tar.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf4793af6b29677b0841c58db392642cb70b477890dc91de01128c7f34738d8d
size 45
deps/smpl/smpl_models/smpl/SMPL_FEMALE.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a583c1b98e4afc19042641f1bae5cd8a1f712a6724886291a7627ec07acd408d
size 39056454
deps/smpl/smpl_models/smpl/SMPL_MALE.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e8c0bbbbc635dcb166ed29c303fb4bef16ea5f623e5a89263495a9e403575bd
size 39056404
deps/smpl/smpl_models/smpl/SMPL_NEUTRAL.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98e65c74ad9b998783132f00880d1025a8d64b158e040e6ef13a557e5098bc42
size 39001280
deps/smpl/smpl_models/smpl/readme.txt
ADDED
@@ -0,0 +1 @@
This directory leaves for SMPL models
deps/smpl/smpl_models/smplh/SMPLH_FEMALE.npz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f0fba73ef2494b26de243c1d88a1dbe1047e5566128cf7222c942089543f4560
size 39708434
deps/smpl/smpl_models/smplh/SMPLH_MALE.npz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:10b617fdd329557937d6fe38e8a542afab236a8887522d9da0bd42e7f2b76eaa
size 39686902
deps/smpl/smpl_models/smplh/SMPLH_NEUTRAL.npz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42969b34d8cd383e172515a7bca6ff3b2c37aa2c5c78088c69d20e517fa96026
size 39708959
deps/smpl/smpl_models/smplh/mano_v1_2.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:50976831790ea9657d8110e0c94e50e90eaf35cd76169f0b27e5d32f3fcd951f
size 175200815
deps/smpl/smpl_models/smplh/smplh.faces
ADDED
Binary file (165 kB)
deps/smpl/smpl_models/smplh/smplh.tar.xz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46d5b8687be48c91181fa88271feff3a5e83aa62a481fad8a0bcb9254b2a74f1
size 113231292
deps/smpl/smpl_models/smplx_parts_segm.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb69c10801205c9cfb5353fdeb1b9cc5ade53d14c265c3339421cdde8b9c91e7
size 1323168
mGPT/__init__.py
ADDED
File without changes
mGPT/archs/__init__.py
ADDED
File without changes
mGPT/archs/mgpt_lm.py
ADDED
@@ -0,0 +1,592 @@
import os
from typing import List, Union
import numpy as np
import math
import time
import heapq
import torch
from torch import Tensor, nn
from torch.distributions.distribution import Distribution
from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer
import random
from typing import Optional
from .tools.token_emb import NewTokenEmb


class MLM(nn.Module):

    def __init__(
        self,
        model_path: str,
        model_type: str = "t5",
        stage: str = "lm_pretrain",
        new_token_type: str = "insert",
        motion_codebook_size: int = 512,
        framerate: float = 20.0,
        down_t: int = 4,
        predict_ratio: float = 0.2,
        inbetween_ratio: float = 0.25,
        max_length: int = 256,
        lora: bool = False,
        quota_ratio: float = 0.5,
        noise_density: float = 0.15,
        mean_noise_span_length: int = 3,
        **kwargs,
    ) -> None:

        super().__init__()

        # Parameters
        self.m_codebook_size = motion_codebook_size
        self.max_length = max_length
        self.framerate = framerate
        self.down_t = down_t
        self.predict_ratio = predict_ratio
        self.inbetween_ratio = inbetween_ratio
        self.noise_density = noise_density
        self.mean_noise_span_length = mean_noise_span_length
        self.quota_ratio = quota_ratio
        self.stage = stage

        # Instantiate language model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=True)
        if model_type == "t5":
            self.language_model = T5ForConditionalGeneration.from_pretrained(
                model_path)
            self.lm_type = 'encdec'
        elif model_type == "gpt2":
            self.language_model = GPT2LMHeadModel.from_pretrained(model_path)
            self.lm_type = 'dec'
        else:
            raise ValueError("type must be either seq2seq or conditional")

        if self.lm_type == 'dec':
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Add motion tokens
        self.tokenizer.add_tokens(
            [f'<motion_id_{i}>' for i in range(self.m_codebook_size + 3)])

        if new_token_type == "insert":
            self.language_model.resize_token_embeddings(len(self.tokenizer))
        elif new_token_type == "mlp":
            shared = NewTokenEmb(self.language_model.shared,
                                 self.m_codebook_size + 3)
            # lm_head = NewTokenEmb(self.language_model.lm_head,
            #                       self.m_codebook_size + 3)
            self.language_model.resize_token_embeddings(len(self.tokenizer))
            self.language_model.shared = shared
            # self.language_model.lm_head = lm_head

        # Lora
        if lora:
            from peft import LoraConfig, TaskType, get_peft_model, get_peft_model_state_dict
            from peft.utils.other import fsdp_auto_wrap_policy
            peft_config = LoraConfig(
                bias="none",
                task_type="CAUSAL_LM",
                # inference_mode=False,
                r=8,
                lora_alpha=16,
                lora_dropout=0.05)
            self.language_model = get_peft_model(self.language_model,
                                                 peft_config)

    def forward(self, texts: List[str], motion_tokens: Tensor,
                lengths: List[int], tasks: dict):
        if self.lm_type == 'encdec':
            return self.forward_encdec(texts, motion_tokens, lengths, tasks)
        elif self.lm_type == 'dec':
            return self.forward_dec(texts, motion_tokens, lengths, tasks)
        else:
            raise NotImplementedError("Only conditional_multitask supported")

    def forward_encdec(
        self,
        texts: List[str],
        motion_tokens: Tensor,
        lengths: List[int],
        tasks: dict,
    ):

        # Tensor to string
        motion_strings = self.motion_token_to_string(motion_tokens, lengths)

        # Supervised or unsupervised
        # condition = random.choice(
        #     ['text', 'motion', 'supervised', 'supervised', 'supervised'])
        condition = random.choice(['supervised', 'supervised', 'supervised'])

        if condition == 'text':
            inputs = texts
            outputs = texts
        elif condition == 'motion':
            inputs = motion_strings
            outputs = motion_strings
        else:
            inputs, outputs = self.template_fulfill(tasks, lengths,
                                                    motion_strings, texts)

        # Tokenize
        source_encoding = self.tokenizer(inputs,
                                         padding='max_length',
                                         max_length=self.max_length,
                                         truncation=True,
                                         return_attention_mask=True,
                                         add_special_tokens=True,
                                         return_tensors="pt")

        source_attention_mask = source_encoding.attention_mask.to(
            motion_tokens.device)
        source_input_ids = source_encoding.input_ids.to(motion_tokens.device)

        if condition in ['text', 'motion']:
            batch_size, expandend_input_length = source_input_ids.shape
            mask_indices = np.asarray([
                self.random_spans_noise_mask(expandend_input_length)
                for i in range(batch_size)
            ])
            target_mask = ~mask_indices
            input_ids_sentinel = self.create_sentinel_ids(
                mask_indices.astype(np.int8))
            target_sentinel = self.create_sentinel_ids(
                target_mask.astype(np.int8))

            labels_input_ids = self.filter_input_ids(source_input_ids,
                                                     target_sentinel)
            source_input_ids = self.filter_input_ids(source_input_ids,
                                                     input_ids_sentinel)

        else:
            target_inputs = self.tokenizer(outputs,
                                           padding='max_length',
                                           max_length=self.max_length,
                                           truncation=True,
                                           return_attention_mask=True,
                                           add_special_tokens=True,
                                           return_tensors="pt")

            labels_input_ids = target_inputs.input_ids.to(motion_tokens.device)
            lables_attention_mask = target_inputs.attention_mask.to(
                motion_tokens.device)

        labels_input_ids[labels_input_ids == 0] = -100
        outputs = self.language_model(
            input_ids=source_input_ids,
            attention_mask=source_attention_mask
            if condition == 'supervised' else None,
            labels=labels_input_ids,
            decoder_attention_mask=lables_attention_mask
            if condition == 'supervised' else None,
        )

        return outputs

    def forward_dec(
        self,
        texts: List[str],
        motion_tokens: Tensor,
        lengths: List[int],
        tasks: dict,
    ):
        self.tokenizer.padding_side = "right"

        # Tensor to string
        motion_strings = self.motion_token_to_string(motion_tokens, lengths)

        # Supervised or unsupervised
        condition = random.choice(
            ['text', 'motion', 'supervised', 'supervised', 'supervised'])

        if condition == 'text':
            labels = texts
        elif condition == 'motion':
            labels = motion_strings
        else:
            inputs, outputs = self.template_fulfill(tasks, lengths,
                                                    motion_strings, texts)
            labels = []
            for i in range(len(inputs)):
                labels.append(inputs[i] + ' \n ' + outputs[i] +
                              self.tokenizer.eos_token)

        # Tokenize
        inputs = self.tokenizer(labels,
                                padding='max_length',
                                max_length=self.max_length,
                                truncation=True,
                                return_attention_mask=True,
                                return_tensors="pt")

        labels_input_ids = inputs.input_ids.to(motion_tokens.device)
        lables_attention_mask = inputs.attention_mask.to(motion_tokens.device)

        # print(labels_input_ids[0:5])

        outputs = self.language_model(input_ids=labels_input_ids,
                                      attention_mask=lables_attention_mask,
                                      labels=inputs["input_ids"])

        return outputs

    def generate_direct(self,
                        texts: List[str],
                        max_length: int = 256,
                        num_beams: int = 1,
                        do_sample: bool = True,
                        bad_words_ids: List[int] = None):

        # Device
        self.device = self.language_model.device

        # Tokenize
        if self.lm_type == 'dec':
            texts = [text + " \n " for text in texts]

        source_encoding = self.tokenizer(texts,
                                         padding='max_length',
                                         max_length=self.max_length,
                                         truncation=True,
                                         return_attention_mask=True,
                                         add_special_tokens=True,
                                         return_tensors="pt")

        source_input_ids = source_encoding.input_ids.to(self.device)
        source_attention_mask = source_encoding.attention_mask.to(self.device)

        if self.lm_type == 'encdec':
            outputs = self.language_model.generate(
                source_input_ids,
                max_length=max_length,
                num_beams=num_beams,
                do_sample=do_sample,
                bad_words_ids=bad_words_ids,
            )
        elif self.lm_type == 'dec':
            outputs = self.language_model.generate(
                input_ids=source_input_ids,
                attention_mask=source_attention_mask,
                pad_token_id=self.tokenizer.pad_token_id,
                do_sample=do_sample,
                max_new_tokens=max_length)
            self.tokenizer.padding_side = 'left'

        outputs_string = self.tokenizer.batch_decode(outputs,
                                                     skip_special_tokens=True)

        print(texts[:2])
        print(outputs_string[:2])

        outputs_tokens, cleaned_text = self.motion_string_to_token(
            outputs_string)

        return outputs_tokens, cleaned_text

    def generate_conditional(self,
                             texts: Optional[List[str]] = None,
                             motion_tokens: Optional[Tensor] = None,
                             lengths: Optional[List[int]] = None,
                             task: str = "t2m",
                             with_len: bool = False,
                             stage: str = 'train',
                             tasks: dict = None):

        self.device = self.language_model.device

        if task in ["t2m", "m2m", "pred", "inbetween"]:

            if task == "t2m":
                assert texts is not None
                motion_strings = [''] * len(texts)
                if not with_len:
                    if tasks is None:
                        tasks = [{
                            'input':
                            ['Generate motion: <Caption_Placeholder>'],
                            'output': ['']
                        }] * len(texts)

                    lengths = [0] * len(texts)
                else:
                    tasks = [{
                        'input': [
                            'Generate motion with <Frame_Placeholder> frames: <Caption_Placeholder>'
                        ],
                        'output': ['']
                    }] * len(texts)

            elif task == "pred":
                assert motion_tokens is not None and lengths is not None
                texts = [''] * len(lengths)
                tasks = [{
                    'input': ['Predict motion: <Motion_Placeholder_s1>'],
                    'output': ['']
                }] * len(lengths)

                motion_strings_old = self.motion_token_to_string(
                    motion_tokens, lengths)
                motion_strings = []
                for i, length in enumerate(lengths):
                    split = length // 5
                    motion_strings.append(
                        '>'.join(motion_strings_old[i].split('>')[:split]) +
                        '>')

            elif task == "inbetween":
                assert motion_tokens is not None and lengths is not None
                texts = [''] * len(lengths)
                tasks = [{
                    'input': [
                        "Complete the masked motion: <Motion_Placeholder_Masked>"
                    ],
                    'output': ['']
                }] * len(lengths)
                motion_strings = self.motion_token_to_string(
                    motion_tokens, lengths)

            inputs, outputs = self.template_fulfill(tasks, lengths,
                                                    motion_strings, texts,
                                                    stage)

            outputs_tokens, cleaned_text = self.generate_direct(inputs,
                                                                max_length=128,
                                                                num_beams=1,
                                                                do_sample=True)

            return outputs_tokens

        elif task == "m2t":
            assert motion_tokens is not None and lengths is not None

            motion_strings = self.motion_token_to_string(
                motion_tokens, lengths)

            if not with_len:
                tasks = [{
                    'input': ['Generate text: <Motion_Placeholder>'],
                    'output': ['']
                }] * len(lengths)
            else:
                tasks = [{
                    'input': [
                        'Generate text with <Frame_Placeholder> frames: <Motion_Placeholder>'
                    ],
                    'output': ['']
                }] * len(lengths)

            texts = [''] * len(lengths)

            inputs, outputs = self.template_fulfill(tasks, lengths,
                                                    motion_strings, texts)
            outputs_tokens, cleaned_text = self.generate_direct(
                inputs,
                max_length=40,
                num_beams=1,
                do_sample=False,
                # bad_words_ids=self.bad_words_ids
            )
            return cleaned_text

    def motion_token_to_string(self, motion_token: Tensor, lengths: List[int]):
        motion_string = []
        for i in range(len(motion_token)):
            motion_i = motion_token[i].cpu(
            ) if motion_token[i].device.type == 'cuda' else motion_token[i]
            motion_list = motion_i.tolist()[:lengths[i]]
            motion_string.append(
                (f'<motion_id_{self.m_codebook_size}>' +
                 ''.join([f'<motion_id_{int(i)}>' for i in motion_list]) +
                 f'<motion_id_{self.m_codebook_size + 1}>'))
        return motion_string

    def motion_token_list_to_string(self, motion_token: Tensor):
        motion_string = []
        for i in range(len(motion_token)):
            motion_i = motion_token[i].cpu(
            ) if motion_token[i].device.type == 'cuda' else motion_token[i]
            motion_list = motion_i.tolist()
            motion_string.append(
                (f'<motion_id_{self.m_codebook_size}>' +
                 ''.join([f'<motion_id_{int(i)}>' for i in motion_list]) +
                 f'<motion_id_{self.m_codebook_size + 1}>'))
        return motion_string

    def motion_string_to_token(self, motion_string: List[str]):
        motion_tokens = []
        output_string = []
        for i in range(len(motion_string)):
            string = self.get_middle_str(
                motion_string[i], f'<motion_id_{self.m_codebook_size}>',
                f'<motion_id_{self.m_codebook_size + 1}>')
            string_list = string.split('><')
            token_list = [
                int(i.split('_')[-1].replace('>', ''))
                for i in string_list[1:-1]
            ]
            if len(token_list) == 0:
                token_list = [0]
            token_list_padded = torch.tensor(token_list,
                                             dtype=int).to(self.device)
            motion_tokens.append(token_list_padded)
            output_string.append(motion_string[i].replace(
                string, '<Motion_Placeholder>'))

        return motion_tokens, output_string

    def placeholder_fulfill(self, prompt: str, length: int, motion_string: str,
                            text: str):

        seconds = math.floor(length / self.framerate)
        motion_splited = motion_string.split('>')
        token_length = length / self.down_t
        predict_head = int(token_length * self.predict_ratio + 1)
        masked_head = int(token_length * self.inbetween_ratio + 1)
        masked_tail = int(token_length * (1 - self.inbetween_ratio) + 1)

        motion_predict_head = '>'.join(
            motion_splited[:predict_head]
        ) + f'><motion_id_{self.m_codebook_size+1}>'
        motion_predict_last = f'<motion_id_{self.m_codebook_size}>' + '>'.join(
            motion_splited[predict_head:])

        motion_masked = '>'.join(
            motion_splited[:masked_head]
        ) + '>' + f'<motion_id_{self.m_codebook_size+2}>' * (
            masked_tail - masked_head) + '>'.join(motion_splited[masked_tail:])

        if random.random() < self.quota_ratio:
            text = f'\"{text}\"'

        prompt = prompt.replace('<Caption_Placeholder>', text).replace(
            '<Motion_Placeholder>',
            motion_string).replace('<Frame_Placeholder>', f'{length}').replace(
                '<Second_Placeholder>', '%.1f' % seconds).replace(
                    '<Motion_Placeholder_s1>', motion_predict_head).replace(
                        '<Motion_Placeholder_s2>',
                        motion_predict_last).replace(
                            '<Motion_Placeholder_Masked>', motion_masked)

        return prompt

    def template_fulfill(self,
                         tasks,
                         lengths,
                         motion_strings,
                         texts,
                         stage='test'):
        inputs = []
        outputs = []
        for i in range(len(lengths)):
            input_template = random.choice(tasks[i]['input'])
            output_template = random.choice(tasks[i]['output'])
            length = lengths[i]
            inputs.append(
                self.placeholder_fulfill(input_template, length,
                                         motion_strings[i], texts[i]))
            outputs.append(
                self.placeholder_fulfill(output_template, length,
                                         motion_strings[i], texts[i]))

        return inputs, outputs

    def get_middle_str(self, content, startStr, endStr):
        try:
            startIndex = content.index(startStr)
            if startIndex >= 0:
                startIndex += len(startStr)
                endIndex = content.index(endStr)
        except:
            return f'<motion_id_{self.m_codebook_size}><motion_id_0><motion_id_{self.m_codebook_size+1}>'

        return f'<motion_id_{self.m_codebook_size}>' + content[
            startIndex:endIndex] + f'<motion_id_{self.m_codebook_size+1}>'

    def random_spans_noise_mask(self, length):
        # From https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py

        orig_length = length

        num_noise_tokens = int(np.round(length * self.noise_density))
        # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
        num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
        num_noise_spans = int(
            np.round(num_noise_tokens / self.mean_noise_span_length))

        # avoid degeneracy by ensuring positive number of noise spans
        num_noise_spans = max(num_noise_spans, 1)
        num_nonnoise_tokens = length - num_noise_tokens

        # pick the lengths of the noise spans and the non-noise spans
        def _random_segmentation(num_items, num_segments):
            """Partition a sequence of items randomly into non-empty segments.
            Args:
                num_items: an integer scalar > 0
                num_segments: an integer scalar in [1, num_items]
            Returns:
                a Tensor with shape [num_segments] containing positive integers that add
                up to num_items
            """
            mask_indices = np.arange(num_items - 1) < (num_segments - 1)
            np.random.shuffle(mask_indices)
            first_in_segment = np.pad(mask_indices, [[1, 0]])
            segment_id = np.cumsum(first_in_segment)
            # count length of sub segments assuming that list is sorted
            _, segment_length = np.unique(segment_id, return_counts=True)
            return segment_length

        noise_span_lengths = _random_segmentation(num_noise_tokens,
                                                  num_noise_spans)
        nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens,
                                                     num_noise_spans)

        interleaved_span_lengths = np.reshape(
            np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1),
            [num_noise_spans * 2],
        )
        span_starts = np.cumsum(interleaved_span_lengths)[:-1]
        span_start_indicator = np.zeros((length, ), dtype=np.int8)
        span_start_indicator[span_starts] = True
        span_num = np.cumsum(span_start_indicator)
        is_noise = np.equal(span_num % 2, 1)

        return is_noise[:orig_length]

    def create_sentinel_ids(self, mask_indices):
        # From https://github.com/huggingface/transformers/blob/main/examples/flax/language-modeling/run_t5_mlm_flax.py
        start_indices = mask_indices - np.roll(mask_indices, 1,
                                               axis=-1) * mask_indices
        start_indices[:, 0] = mask_indices[:, 0]

        sentinel_ids = np.where(start_indices != 0,
                                np.cumsum(start_indices, axis=-1),
                                start_indices)
        sentinel_ids = np.where(sentinel_ids != 0,
                                (len(self.tokenizer) - sentinel_ids), 0)
        sentinel_ids -= mask_indices - start_indices

        return sentinel_ids

    def filter_input_ids(self, input_ids, sentinel_ids):
        # From https://github.com/huggingface/transformers/blob/main/examples/flax/language-modeling/run_t5_mlm_flax.py
        batch_size = input_ids.shape[0]

        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids,
                                  input_ids.to('cpu'))

        # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are
        # masked tokens coming after sentinel tokens and should be removed
        input_ids = input_ids_full[input_ids_full >= 0].reshape(
            (batch_size, -1))
        input_ids = np.concatenate(
            [
                input_ids,
                np.full((batch_size, 1),
                        self.tokenizer.eos_token_id,
                        dtype=np.int32),
            ],
            axis=-1,
        )

        input_ids = torch.tensor(input_ids, device=self.device)

        return input_ids
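A short usage sketch for the MLM wrapper above, for the text-to-motion task. The backbone name is an assumption, and without the fine-tuned checkpoint listed in TEST.CHECKPOINTS the generated tokens are meaningless, so this only illustrates the call shape:

import torch
from mGPT.archs.mgpt_lm import MLM

lm = MLM(model_path="google/flan-t5-base",   # assumed T5 backbone
         model_type="t5",
         motion_codebook_size=512).eval()
with torch.no_grad():
    motion_tokens = lm.generate_conditional(
        texts=["a person walks forward and waves"], task="t2m")
# `motion_tokens` is a list of 1-D tensors of motion codebook indices,
# which the VQ-VAE decoder below (mgpt_vq.py) turns back into motion features.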
mGPT/archs/mgpt_vq.py
ADDED
@@ -0,0 +1,190 @@
# Partially from https://github.com/Mael-zys/T2M-GPT

from typing import List, Optional, Union
import torch
import torch.nn as nn
from torch import Tensor, nn
from torch.distributions.distribution import Distribution
from .tools.resnet import Resnet1D
from .tools.quantize_cnn import QuantizeEMAReset, Quantizer, QuantizeEMA, QuantizeReset
from collections import OrderedDict


class VQVae(nn.Module):

    def __init__(self,
                 nfeats: int,
                 quantizer: str = "ema_reset",
                 code_num=512,
                 code_dim=512,
                 output_emb_width=512,
                 down_t=3,
                 stride_t=2,
                 width=512,
                 depth=3,
                 dilation_growth_rate=3,
                 norm=None,
                 activation: str = "relu",
                 **kwargs) -> None:

        super().__init__()

        self.code_dim = code_dim

        self.encoder = Encoder(nfeats,
                               output_emb_width,
                               down_t,
                               stride_t,
                               width,
                               depth,
                               dilation_growth_rate,
                               activation=activation,
                               norm=norm)

        self.decoder = Decoder(nfeats,
                               output_emb_width,
                               down_t,
                               stride_t,
                               width,
                               depth,
                               dilation_growth_rate,
                               activation=activation,
                               norm=norm)

        if quantizer == "ema_reset":
            self.quantizer = QuantizeEMAReset(code_num, code_dim, mu=0.99)
        elif quantizer == "orig":
            self.quantizer = Quantizer(code_num, code_dim, beta=1.0)
        elif quantizer == "ema":
            self.quantizer = QuantizeEMA(code_num, code_dim, mu=0.99)
        elif quantizer == "reset":
            self.quantizer = QuantizeReset(code_num, code_dim)

    def preprocess(self, x):
        # (bs, T, Jx3) -> (bs, Jx3, T)
        x = x.permute(0, 2, 1)
        return x

    def postprocess(self, x):
        # (bs, Jx3, T) -> (bs, T, Jx3)
        x = x.permute(0, 2, 1)
        return x

    def forward(self, features: Tensor):
        # Preprocess
        x_in = self.preprocess(features)

        # Encode
        x_encoder = self.encoder(x_in)

        # quantization
        x_quantized, loss, perplexity = self.quantizer(x_encoder)

        # decoder
        x_decoder = self.decoder(x_quantized)
        x_out = self.postprocess(x_decoder)

        return x_out, loss, perplexity

    def encode(
        self,
        features: Tensor,
    ) -> Union[Tensor, Distribution]:

        N, T, _ = features.shape
        x_in = self.preprocess(features)
        x_encoder = self.encoder(x_in)
        x_encoder = self.postprocess(x_encoder)
        x_encoder = x_encoder.contiguous().view(-1,
                                                x_encoder.shape[-1])  # (NT, C)
        code_idx = self.quantizer.quantize(x_encoder)
        code_idx = code_idx.view(N, -1)

        # latent, dist
        return code_idx, None

    def decode(self, z: Tensor):

        x_d = self.quantizer.dequantize(z)
        x_d = x_d.view(1, -1, self.code_dim).permute(0, 2, 1).contiguous()

        # decoder
        x_decoder = self.decoder(x_d)
        x_out = self.postprocess(x_decoder)
        return x_out


class Encoder(nn.Module):

    def __init__(self,
                 input_emb_width=3,
                 output_emb_width=512,
                 down_t=3,
                 stride_t=2,
                 width=512,
                 depth=3,
                 dilation_growth_rate=3,
                 activation='relu',
                 norm=None):
        super().__init__()

        blocks = []
        filter_t, pad_t = stride_t * 2, stride_t // 2
        blocks.append(nn.Conv1d(input_emb_width, width, 3, 1, 1))
        blocks.append(nn.ReLU())

        for i in range(down_t):
            input_dim = width
            block = nn.Sequential(
                nn.Conv1d(input_dim, width, filter_t, stride_t, pad_t),
                Resnet1D(width,
                         depth,
                         dilation_growth_rate,
                         activation=activation,
                         norm=norm),
            )
            blocks.append(block)
        blocks.append(nn.Conv1d(width, output_emb_width, 3, 1, 1))
        self.model = nn.Sequential(*blocks)

    def forward(self, x):
        return self.model(x)


class Decoder(nn.Module):

    def __init__(self,
                 input_emb_width=3,
                 output_emb_width=512,
                 down_t=3,
                 stride_t=2,
                 width=512,
                 depth=3,
                 dilation_growth_rate=3,
                 activation='relu',
                 norm=None):
        super().__init__()
        blocks = []

        filter_t, pad_t = stride_t * 2, stride_t // 2
        blocks.append(nn.Conv1d(output_emb_width, width, 3, 1, 1))
        blocks.append(nn.ReLU())
        for i in range(down_t):
            out_dim = width
            block = nn.Sequential(
                Resnet1D(width,
                         depth,
                         dilation_growth_rate,
                         reverse_dilation=True,
                         activation=activation,
                         norm=norm), nn.Upsample(scale_factor=2,
                                                 mode='nearest'),
                nn.Conv1d(width, out_dim, 3, 1, 1))
            blocks.append(block)
        blocks.append(nn.Conv1d(width, width, 3, 1, 1))
        blocks.append(nn.ReLU())
        blocks.append(nn.Conv1d(width, input_emb_width, 3, 1, 1))
        self.model = nn.Sequential(*blocks)

    def forward(self, x):
        return self.model(x)
mGPT/archs/tm2t_evaluator.py
ADDED
@@ -0,0 +1,111 @@
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence


class MovementConvEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MovementConvEncoder, self).__init__()
        self.main = nn.Sequential(
            nn.Conv1d(input_size, hidden_size, 4, 2, 1),
            nn.Dropout(0.2, inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv1d(hidden_size, output_size, 4, 2, 1),
            nn.Dropout(0.2, inplace=True),
            nn.LeakyReLU(0.2, inplace=True),
        )
        self.out_net = nn.Linear(output_size, output_size)
        # self.main.apply(init_weight)
        # self.out_net.apply(init_weight)

    def forward(self, inputs):
        inputs = inputs.permute(0, 2, 1)
        outputs = self.main(inputs).permute(0, 2, 1)
        # print(outputs.shape)
        return self.out_net(outputs)


class MotionEncoderBiGRUCo(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MotionEncoderBiGRUCo, self).__init__()

        self.input_emb = nn.Linear(input_size, hidden_size)
        self.gru = nn.GRU(
            hidden_size, hidden_size, batch_first=True, bidirectional=True
        )
        self.output_net = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(hidden_size, output_size),
        )

        # self.input_emb.apply(init_weight)
        # self.output_net.apply(init_weight)
        self.hidden_size = hidden_size
        self.hidden = nn.Parameter(
            torch.randn((2, 1, self.hidden_size), requires_grad=True)
        )

    # input(batch_size, seq_len, dim)
    def forward(self, inputs, m_lens):
        num_samples = inputs.shape[0]

        input_embs = self.input_emb(inputs)
        hidden = self.hidden.repeat(1, num_samples, 1)

        cap_lens = m_lens.data.tolist()

        # emb = pack_padded_sequence(input=input_embs, lengths=cap_lens, batch_first=True)
        emb = input_embs

        gru_seq, gru_last = self.gru(emb, hidden)

        gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)

        return self.output_net(gru_last)


class TextEncoderBiGRUCo(nn.Module):
    def __init__(self, word_size, pos_size, hidden_size, output_size):
        super(TextEncoderBiGRUCo, self).__init__()

        self.pos_emb = nn.Linear(pos_size, word_size)
        self.input_emb = nn.Linear(word_size, hidden_size)
        self.gru = nn.GRU(
            hidden_size, hidden_size, batch_first=True, bidirectional=True
        )
        self.output_net = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(hidden_size, output_size),
        )

        # self.input_emb.apply(init_weight)
        # self.pos_emb.apply(init_weight)
        # self.output_net.apply(init_weight)
        # self.linear2.apply(init_weight)
        # self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.hidden = nn.Parameter(
            torch.randn((2, 1, self.hidden_size), requires_grad=True)
        )

    # input(batch_size, seq_len, dim)
    def forward(self, word_embs, pos_onehot, cap_lens):
        num_samples = word_embs.shape[0]

        pos_embs = self.pos_emb(pos_onehot)
        inputs = word_embs + pos_embs
        input_embs = self.input_emb(inputs)
        hidden = self.hidden.repeat(1, num_samples, 1)

        cap_lens = cap_lens.data.tolist()
        emb = pack_padded_sequence(input=input_embs, lengths=cap_lens, batch_first=True)

        gru_seq, gru_last = self.gru(emb, hidden)

        gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)

        return self.output_net(gru_last)
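These are the TM2T evaluator encoders behind the `TM2TMetrics` entry in configs/webui.yaml. A shape sketch with the commonly used HumanML3D evaluator sizes (treated here as assumptions; the authoritative values live in configs/evaluator/tm2t.yaml):

import torch
from mGPT.archs.tm2t_evaluator import MovementConvEncoder, MotionEncoderBiGRUCo

move_enc = MovementConvEncoder(input_size=259, hidden_size=512, output_size=512)
motion_enc = MotionEncoderBiGRUCo(input_size=512, hidden_size=1024, output_size=512)

motion = torch.randn(2, 196, 259)            # (batch, frames, features)
movements = move_enc(motion)                 # (2, 49, 512) after two stride-2 convs
lengths = torch.tensor([49, 49])
embedding = motion_enc(movements, lengths)   # (2, 512) sequence-level embedding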
mGPT/archs/tools/embeddings.py
ADDED
@@ -0,0 +1,322 @@
# This file is taken from signjoey repository
import math

import torch
from torch import Tensor, nn


def get_activation(activation_type):
    if activation_type == "relu":
        return nn.ReLU()
    elif activation_type == "relu6":
        return nn.ReLU6()
    elif activation_type == "prelu":
        return nn.PReLU()
    elif activation_type == "selu":
        return nn.SELU()
    elif activation_type == "celu":
        return nn.CELU()
    elif activation_type == "gelu":
        return nn.GELU()
    elif activation_type == "sigmoid":
        return nn.Sigmoid()
    elif activation_type == "softplus":
        return nn.Softplus()
    elif activation_type == "softshrink":
        return nn.Softshrink()
    elif activation_type == "softsign":
        return nn.Softsign()
    elif activation_type == "tanh":
        return nn.Tanh()
    elif activation_type == "tanhshrink":
        return nn.Tanhshrink()
    else:
        raise ValueError("Unknown activation type {}".format(activation_type))


class MaskedNorm(nn.Module):
    """
    Original Code from:
    https://discuss.pytorch.org/t/batchnorm-for-different-sized-samples-in-batch/44251/8
    """

    def __init__(self, norm_type, num_groups, num_features):
        super().__init__()
        self.norm_type = norm_type
        if self.norm_type == "batch":
            self.norm = nn.BatchNorm1d(num_features=num_features)
        elif self.norm_type == "group":
            self.norm = nn.GroupNorm(num_groups=num_groups, num_channels=num_features)
        elif self.norm_type == "layer":
            self.norm = nn.LayerNorm(normalized_shape=num_features)
        else:
            raise ValueError("Unsupported Normalization Layer")

        self.num_features = num_features

    def forward(self, x: Tensor, mask: Tensor):
        if self.training:
            reshaped = x.reshape([-1, self.num_features])
            reshaped_mask = mask.reshape([-1, 1]) > 0
            selected = torch.masked_select(reshaped, reshaped_mask).reshape(
                [-1, self.num_features]
            )
            batch_normed = self.norm(selected)
            scattered = reshaped.masked_scatter(reshaped_mask, batch_normed)
            return scattered.reshape([x.shape[0], -1, self.num_features])
        else:
            reshaped = x.reshape([-1, self.num_features])
            batched_normed = self.norm(reshaped)
            return batched_normed.reshape([x.shape[0], -1, self.num_features])


# TODO (Cihan): Spatial and Word Embeddings are pretty much the same
# We might as well convert them into a single module class.
# Only difference is the lut vs linear layers.
class Embeddings(nn.Module):

    """
    Simple embeddings class
    """

    # pylint: disable=unused-argument
    def __init__(
        self,
        embedding_dim: int = 64,
        num_heads: int = 8,
        scale: bool = False,
        scale_factor: float = None,
        norm_type: str = None,
        activation_type: str = None,
        vocab_size: int = 0,
        padding_idx: int = 1,
        freeze: bool = False,
        **kwargs
    ):
        """
        Create new embeddings for the vocabulary.
        Use scaling for the Transformer.

        :param embedding_dim:
        :param scale:
        :param vocab_size:
        :param padding_idx:
        :param freeze: freeze the embeddings during training
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.lut = nn.Embedding(vocab_size, self.embedding_dim, padding_idx=padding_idx)

        self.norm_type = norm_type
        if self.norm_type:
            self.norm = MaskedNorm(
                norm_type=norm_type, num_groups=num_heads, num_features=embedding_dim
            )

        self.activation_type = activation_type
        if self.activation_type:
            self.activation = get_activation(activation_type)

        self.scale = scale
        if self.scale:
            if scale_factor:
                self.scale_factor = scale_factor
            else:
                self.scale_factor = math.sqrt(self.embedding_dim)

        if freeze:
            freeze_params(self)

    # pylint: disable=arguments-differ
    def forward(self, x: Tensor, mask: Tensor = None) -> Tensor:
        """
        Perform lookup for input `x` in the embedding table.

        :param mask: token masks
        :param x: index in the vocabulary
        :return: embedded representation for `x`
        """

        x = self.lut(x)

        if self.norm_type:
            x = self.norm(x, mask)

        if self.activation_type:
            x = self.activation(x)

        if self.scale:
            return x * self.scale_factor
        else:
            return x

    def __repr__(self):
        return "%s(embedding_dim=%d, vocab_size=%d)" % (
            self.__class__.__name__,
            self.embedding_dim,
            self.vocab_size,
        )


class SpatialEmbeddings(nn.Module):

    """
    Simple Linear Projection Layer
    (For encoder outputs to predict glosses)
    """

    # pylint: disable=unused-argument
    def __init__(
        self,
        embedding_dim: int,
        input_size: int,
        num_heads: int,
        freeze: bool = False,
        norm_type: str = "batch",
        activation_type: str = "softsign",
        scale: bool = False,
        scale_factor: float = None,
        **kwargs
    ):
        """
        Create new embeddings for the vocabulary.
        Use scaling for the Transformer.

        :param embedding_dim:
        :param input_size:
        :param freeze: freeze the embeddings during training
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.input_size = input_size
        self.ln = nn.Linear(self.input_size, self.embedding_dim)

        self.norm_type = norm_type
        if self.norm_type:
            self.norm = MaskedNorm(
                norm_type=norm_type, num_groups=num_heads, num_features=embedding_dim
            )

        self.activation_type = activation_type
        if self.activation_type:
            self.activation = get_activation(activation_type)

        self.scale = scale
        if self.scale:
            if scale_factor:
                self.scale_factor = scale_factor
            else:
                self.scale_factor = math.sqrt(self.embedding_dim)

        if freeze:
            freeze_params(self)

    # pylint: disable=arguments-differ
    def forward(self, x: Tensor, mask: Tensor) -> Tensor:
        """
        :param mask: frame masks
        :param x: input frame features
        :return: embedded representation for `x`
        """

        x = self.ln(x)

        if self.norm_type:
            x = self.norm(x, mask)

        if self.activation_type:
            x = self.activation(x)

        if self.scale:
            return x * self.scale_factor
        else:
            return x

    def __repr__(self):
        return "%s(embedding_dim=%d, input_size=%d)" % (
            self.__class__.__name__,
            self.embedding_dim,
            self.input_size,
        )

def get_timestep_embedding(
    timesteps: torch.Tensor,
    embedding_dim: int,
    flip_sin_to_cos: bool = False,
    downscale_freq_shift: float = 1,
    scale: float = 1,
    max_period: int = 10000,
):
    """
    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
    embeddings. :return: an [N x dim] Tensor of positional embeddings.
    """
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2
    exponent = -math.log(max_period) * torch.arange(
        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
    )
    exponent = exponent / (half_dim - downscale_freq_shift)

    emb = torch.exp(exponent)
    emb = timesteps[:, None].float() * emb[None, :]

    # scale embeddings
    emb = scale * emb

    # concat sine and cosine embeddings
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)

    # flip sine and cosine embeddings
    if flip_sin_to_cos:
        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)

    # zero pad
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb


class TimestepEmbedding(nn.Module):
    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
        super().__init__()

        self.linear_1 = nn.Linear(channel, time_embed_dim)
        self.act = None
        if act_fn == "silu":
            self.act = nn.SiLU()
        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)

    def forward(self, sample):
        sample = self.linear_1(sample)

        if self.act is not None:
            sample = self.act(sample)

        sample = self.linear_2(sample)
        return sample


class Timesteps(nn.Module):
    def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
        super().__init__()
        self.num_channels = num_channels
        self.flip_sin_to_cos = flip_sin_to_cos
        self.downscale_freq_shift = downscale_freq_shift

    def forward(self, timesteps):
        t_emb = get_timestep_embedding(
            timesteps,
            self.num_channels,
            flip_sin_to_cos=self.flip_sin_to_cos,
            downscale_freq_shift=self.downscale_freq_shift,
        )
        return t_emb
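A quick check of the sinusoidal timestep helpers defined at the end of this file (the channel and embedding sizes are arbitrary choices for illustration):

import torch
from mGPT.archs.tools.embeddings import Timesteps, TimestepEmbedding

time_proj = Timesteps(num_channels=128, flip_sin_to_cos=True, downscale_freq_shift=0.0)
time_mlp = TimestepEmbedding(channel=128, time_embed_dim=512)

t = torch.arange(4)       # four integer timesteps
emb = time_proj(t)        # (4, 128) sinusoidal features
cond = time_mlp(emb)      # (4, 512) learned projection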
mGPT/archs/tools/quantize_cnn.py
ADDED
@@ -0,0 +1,414 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class QuantizeEMAReset(nn.Module):
    def __init__(self, nb_code, code_dim, mu):
        super().__init__()
        self.nb_code = nb_code
        self.code_dim = code_dim
        self.mu = mu
        self.reset_codebook()

    def reset_codebook(self):
        self.init = False
        self.code_sum = None
        self.code_count = None
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim).to(device))

    def _tile(self, x):
        nb_code_x, code_dim = x.shape
        if nb_code_x < self.nb_code:
            n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
            std = 0.01 / np.sqrt(code_dim)
            out = x.repeat(n_repeats, 1)
            out = out + torch.randn_like(out) * std
        else :
            out = x
        return out

    def init_codebook(self, x):
        out = self._tile(x)
        self.codebook = out[:self.nb_code]
        self.code_sum = self.codebook.clone()
        self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
        self.init = True

    @torch.no_grad()
    def compute_perplexity(self, code_idx) :
        # Calculate new centres
        code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device)  # nb_code, N * L
        code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)

        code_count = code_onehot.sum(dim=-1)  # nb_code
        prob = code_count / torch.sum(code_count)
        perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
        return perplexity

    @torch.no_grad()
    def update_codebook(self, x, code_idx):

        code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device)  # nb_code, N * L
        code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)

        code_sum = torch.matmul(code_onehot, x)  # nb_code, w
        code_count = code_onehot.sum(dim=-1)  # nb_code

        out = self._tile(x)
        code_rand = out[:self.nb_code]

        # Update centres
        self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum  # w, nb_code
        self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count  # nb_code

        usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
        code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)

        self.codebook = usage * code_update + (1 - usage) * code_rand
        prob = code_count / torch.sum(code_count)
        perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))

        return perplexity

    def preprocess(self, x):
        # NCT -> NTC -> [NT, C]
        x = x.permute(0, 2, 1).contiguous()
        x = x.view(-1, x.shape[-1])
        return x

    def quantize(self, x):
        # Calculate latent code x_l
        k_w = self.codebook.t()
        distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
                                                                                                  keepdim=True)  # (N * L, b)
        _, code_idx = torch.min(distance, dim=-1)
        return code_idx

    def dequantize(self, code_idx):
        x = F.embedding(code_idx, self.codebook)
        return x

    def forward(self, x):
        N, width, T = x.shape

        # Preprocess
        x = self.preprocess(x)

        # Init codebook if not inited
        if self.training and not self.init:
            self.init_codebook(x)

        # quantize and dequantize through bottleneck
        code_idx = self.quantize(x)
        x_d = self.dequantize(code_idx)

        # Update embeddings
        if self.training:
            perplexity = self.update_codebook(x, code_idx)
        else :
            perplexity = self.compute_perplexity(code_idx)

        # Loss
        commit_loss = F.mse_loss(x, x_d.detach())

        # Passthrough
        x_d = x + (x_d - x).detach()

        # Postprocess
        x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous()  #(N, DIM, T)

        return x_d, commit_loss, perplexity


class Quantizer(nn.Module):
    def __init__(self, n_e, e_dim, beta):
        super(Quantizer, self).__init__()

        self.e_dim = e_dim
        self.n_e = n_e
        self.beta = beta

        self.embedding = nn.Embedding(self.n_e, self.e_dim)
        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)

    def forward(self, z):

        N, width, T = z.shape
        z = self.preprocess(z)
        assert z.shape[-1] == self.e_dim
        z_flattened = z.contiguous().view(-1, self.e_dim)

        # B x V
        d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
            torch.sum(self.embedding.weight**2, dim=1) - 2 * \
            torch.matmul(z_flattened, self.embedding.weight.t())
        # B x 1
        min_encoding_indices = torch.argmin(d, dim=1)
        z_q = self.embedding(min_encoding_indices).view(z.shape)

        # compute loss for embedding
        loss = torch.mean((z_q - z.detach())**2) + self.beta * \
            torch.mean((z_q.detach() - z)**2)

        # preserve gradients
        z_q = z + (z_q - z).detach()
        z_q = z_q.view(N, T, -1).permute(0, 2, 1).contiguous()  #(N, DIM, T)

        min_encodings = F.one_hot(min_encoding_indices, self.n_e).type(z.dtype)
        e_mean = torch.mean(min_encodings, dim=0)
        perplexity = torch.exp(-torch.sum(e_mean*torch.log(e_mean + 1e-10)))
        return z_q, loss, perplexity

    def quantize(self, z):

        assert z.shape[-1] == self.e_dim

        # B x V
        d = torch.sum(z ** 2, dim=1, keepdim=True) + \
            torch.sum(self.embedding.weight ** 2, dim=1) - 2 * \
            torch.matmul(z, self.embedding.weight.t())
        # B x 1
        min_encoding_indices = torch.argmin(d, dim=1)
        return min_encoding_indices

    def dequantize(self, indices):

        index_flattened = indices.view(-1)
        z_q = self.embedding(index_flattened)
        z_q = z_q.view(indices.shape + (self.e_dim, )).contiguous()
        return z_q

    def preprocess(self, x):
        # NCT -> NTC -> [NT, C]
        x = x.permute(0, 2, 1).contiguous()
        x = x.view(-1, x.shape[-1])
        return x


class QuantizeReset(nn.Module):
    def __init__(self, nb_code, code_dim):
        super().__init__()
        self.nb_code = nb_code
        self.code_dim = code_dim
        self.reset_codebook()
        self.codebook = nn.Parameter(torch.randn(nb_code, code_dim))

    def reset_codebook(self):
        self.init = False
        self.code_count = None

    def _tile(self, x):
        nb_code_x, code_dim = x.shape
        if nb_code_x < self.nb_code:
            n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
|
210 |
+
std = 0.01 / np.sqrt(code_dim)
|
211 |
+
out = x.repeat(n_repeats, 1)
|
212 |
+
out = out + torch.randn_like(out) * std
|
213 |
+
else :
|
214 |
+
out = x
|
215 |
+
return out
|
216 |
+
|
217 |
+
def init_codebook(self, x):
|
218 |
+
out = self._tile(x)
|
219 |
+
self.codebook = nn.Parameter(out[:self.nb_code])
|
220 |
+
self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
|
221 |
+
self.init = True
|
222 |
+
|
223 |
+
@torch.no_grad()
|
224 |
+
def compute_perplexity(self, code_idx) :
|
225 |
+
# Calculate new centres
|
226 |
+
code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
|
227 |
+
code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
|
228 |
+
|
229 |
+
code_count = code_onehot.sum(dim=-1) # nb_code
|
230 |
+
prob = code_count / torch.sum(code_count)
|
231 |
+
perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
|
232 |
+
return perplexity
|
233 |
+
|
234 |
+
def update_codebook(self, x, code_idx):
|
235 |
+
|
236 |
+
code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
|
237 |
+
code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
|
238 |
+
|
239 |
+
code_count = code_onehot.sum(dim=-1) # nb_code
|
240 |
+
|
241 |
+
out = self._tile(x)
|
242 |
+
code_rand = out[:self.nb_code]
|
243 |
+
|
244 |
+
# Update centres
|
245 |
+
self.code_count = code_count # nb_code
|
246 |
+
usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
|
247 |
+
|
248 |
+
self.codebook.data = usage * self.codebook.data + (1 - usage) * code_rand
|
249 |
+
prob = code_count / torch.sum(code_count)
|
250 |
+
perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
|
251 |
+
|
252 |
+
|
253 |
+
return perplexity
|
254 |
+
|
255 |
+
def preprocess(self, x):
|
256 |
+
# NCT -> NTC -> [NT, C]
|
257 |
+
x = x.permute(0, 2, 1).contiguous()
|
258 |
+
x = x.view(-1, x.shape[-1])
|
259 |
+
return x
|
260 |
+
|
261 |
+
def quantize(self, x):
|
262 |
+
# Calculate latent code x_l
|
263 |
+
k_w = self.codebook.t()
|
264 |
+
distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
|
265 |
+
keepdim=True) # (N * L, b)
|
266 |
+
_, code_idx = torch.min(distance, dim=-1)
|
267 |
+
return code_idx
|
268 |
+
|
269 |
+
def dequantize(self, code_idx):
|
270 |
+
x = F.embedding(code_idx, self.codebook)
|
271 |
+
return x
|
272 |
+
|
273 |
+
|
274 |
+
def forward(self, x):
|
275 |
+
N, width, T = x.shape
|
276 |
+
# Preprocess
|
277 |
+
x = self.preprocess(x)
|
278 |
+
# Init codebook if not inited
|
279 |
+
if self.training and not self.init:
|
280 |
+
self.init_codebook(x)
|
281 |
+
# quantize and dequantize through bottleneck
|
282 |
+
code_idx = self.quantize(x)
|
283 |
+
x_d = self.dequantize(code_idx)
|
284 |
+
# Update embeddings
|
285 |
+
if self.training:
|
286 |
+
perplexity = self.update_codebook(x, code_idx)
|
287 |
+
else :
|
288 |
+
perplexity = self.compute_perplexity(code_idx)
|
289 |
+
|
290 |
+
# Loss
|
291 |
+
commit_loss = F.mse_loss(x, x_d.detach())
|
292 |
+
|
293 |
+
# Passthrough
|
294 |
+
x_d = x + (x_d - x).detach()
|
295 |
+
|
296 |
+
# Postprocess
|
297 |
+
x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
|
298 |
+
|
299 |
+
return x_d, commit_loss, perplexity
|
300 |
+
|
301 |
+
|
302 |
+
class QuantizeEMA(nn.Module):
|
303 |
+
def __init__(self, nb_code, code_dim, mu):
|
304 |
+
super().__init__()
|
305 |
+
self.nb_code = nb_code
|
306 |
+
self.code_dim = code_dim
|
307 |
+
self.mu = mu
|
308 |
+
self.reset_codebook()
|
309 |
+
|
310 |
+
def reset_codebook(self):
|
311 |
+
self.init = False
|
312 |
+
self.code_sum = None
|
313 |
+
self.code_count = None
|
314 |
+
self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim).cuda())
|
315 |
+
|
316 |
+
def _tile(self, x):
|
317 |
+
nb_code_x, code_dim = x.shape
|
318 |
+
if nb_code_x < self.nb_code:
|
319 |
+
n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
|
320 |
+
std = 0.01 / np.sqrt(code_dim)
|
321 |
+
out = x.repeat(n_repeats, 1)
|
322 |
+
out = out + torch.randn_like(out) * std
|
323 |
+
else :
|
324 |
+
out = x
|
325 |
+
return out
|
326 |
+
|
327 |
+
def init_codebook(self, x):
|
328 |
+
out = self._tile(x)
|
329 |
+
self.codebook = out[:self.nb_code]
|
330 |
+
self.code_sum = self.codebook.clone()
|
331 |
+
self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
|
332 |
+
self.init = True
|
333 |
+
|
334 |
+
@torch.no_grad()
|
335 |
+
def compute_perplexity(self, code_idx) :
|
336 |
+
# Calculate new centres
|
337 |
+
code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
|
338 |
+
code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
|
339 |
+
|
340 |
+
code_count = code_onehot.sum(dim=-1) # nb_code
|
341 |
+
prob = code_count / torch.sum(code_count)
|
342 |
+
perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
|
343 |
+
return perplexity
|
344 |
+
|
345 |
+
@torch.no_grad()
|
346 |
+
def update_codebook(self, x, code_idx):
|
347 |
+
|
348 |
+
code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
|
349 |
+
code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
|
350 |
+
|
351 |
+
code_sum = torch.matmul(code_onehot, x) # nb_code, w
|
352 |
+
code_count = code_onehot.sum(dim=-1) # nb_code
|
353 |
+
|
354 |
+
# Update centres
|
355 |
+
self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum # w, nb_code
|
356 |
+
self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count # nb_code
|
357 |
+
|
358 |
+
code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)
|
359 |
+
|
360 |
+
self.codebook = code_update
|
361 |
+
prob = code_count / torch.sum(code_count)
|
362 |
+
perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
|
363 |
+
|
364 |
+
return perplexity
|
365 |
+
|
366 |
+
def preprocess(self, x):
|
367 |
+
# NCT -> NTC -> [NT, C]
|
368 |
+
x = x.permute(0, 2, 1).contiguous()
|
369 |
+
x = x.view(-1, x.shape[-1])
|
370 |
+
return x
|
371 |
+
|
372 |
+
def quantize(self, x):
|
373 |
+
# Calculate latent code x_l
|
374 |
+
k_w = self.codebook.t()
|
375 |
+
distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
|
376 |
+
keepdim=True) # (N * L, b)
|
377 |
+
_, code_idx = torch.min(distance, dim=-1)
|
378 |
+
return code_idx
|
379 |
+
|
380 |
+
def dequantize(self, code_idx):
|
381 |
+
x = F.embedding(code_idx, self.codebook)
|
382 |
+
return x
|
383 |
+
|
384 |
+
|
385 |
+
def forward(self, x):
|
386 |
+
N, width, T = x.shape
|
387 |
+
|
388 |
+
# Preprocess
|
389 |
+
x = self.preprocess(x)
|
390 |
+
|
391 |
+
# Init codebook if not inited
|
392 |
+
if self.training and not self.init:
|
393 |
+
self.init_codebook(x)
|
394 |
+
|
395 |
+
# quantize and dequantize through bottleneck
|
396 |
+
code_idx = self.quantize(x)
|
397 |
+
x_d = self.dequantize(code_idx)
|
398 |
+
|
399 |
+
# Update embeddings
|
400 |
+
if self.training:
|
401 |
+
perplexity = self.update_codebook(x, code_idx)
|
402 |
+
else :
|
403 |
+
perplexity = self.compute_perplexity(code_idx)
|
404 |
+
|
405 |
+
# Loss
|
406 |
+
commit_loss = F.mse_loss(x, x_d.detach())
|
407 |
+
|
408 |
+
# Passthrough
|
409 |
+
x_d = x + (x_d - x).detach()
|
410 |
+
|
411 |
+
# Postprocess
|
412 |
+
x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
|
413 |
+
|
414 |
+
return x_d, commit_loss, perplexity
|
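Note: the quantizers in this file all share the same NCT-in / NCT-out interface, so they can be swapped behind the VQ-VAE bottleneck. A minimal usage sketch; the constructor values (nb_code, code_dim, mu) and tensor shapes below are illustrative assumptions, not values taken from the training configs:

import torch
from mGPT.archs.tools.quantize_cnn import QuantizeEMAReset

quantizer = QuantizeEMAReset(nb_code=512, code_dim=512, mu=0.99)
latents = torch.randn(4, 512, 16)  # (batch, code_dim, time) as produced by the VQ-VAE encoder
x_d, commit_loss, perplexity = quantizer(latents)  # x_d keeps the (N, DIM, T) layout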
mGPT/archs/tools/resnet.py
ADDED
@@ -0,0 +1,82 @@
import torch.nn as nn
import torch

class nonlinearity(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        # swish
        return x * torch.sigmoid(x)

class ResConv1DBlock(nn.Module):
    def __init__(self, n_in, n_state, dilation=1, activation='silu', norm=None, dropout=None):
        super().__init__()
        padding = dilation
        self.norm = norm
        if norm == "LN":
            self.norm1 = nn.LayerNorm(n_in)
            self.norm2 = nn.LayerNorm(n_in)
        elif norm == "GN":
            self.norm1 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
            self.norm2 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
        elif norm == "BN":
            self.norm1 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
            self.norm2 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
        else:
            self.norm1 = nn.Identity()
            self.norm2 = nn.Identity()

        if activation == "relu":
            self.activation1 = nn.ReLU()
            self.activation2 = nn.ReLU()
        elif activation == "silu":
            self.activation1 = nonlinearity()
            self.activation2 = nonlinearity()
        elif activation == "gelu":
            self.activation1 = nn.GELU()
            self.activation2 = nn.GELU()

        self.conv1 = nn.Conv1d(n_in, n_state, 3, 1, padding, dilation)
        self.conv2 = nn.Conv1d(n_state, n_in, 1, 1, 0)

    def forward(self, x):
        x_orig = x
        if self.norm == "LN":
            x = self.norm1(x.transpose(-2, -1))
            x = self.activation1(x.transpose(-2, -1))
        else:
            x = self.norm1(x)
            x = self.activation1(x)

        x = self.conv1(x)

        if self.norm == "LN":
            x = self.norm2(x.transpose(-2, -1))
            x = self.activation2(x.transpose(-2, -1))
        else:
            x = self.norm2(x)
            x = self.activation2(x)

        x = self.conv2(x)
        x = x + x_orig
        return x

class Resnet1D(nn.Module):
    def __init__(self, n_in, n_depth, dilation_growth_rate=1, reverse_dilation=True, activation='relu', norm=None):
        super().__init__()

        blocks = [ResConv1DBlock(n_in, n_in, dilation=dilation_growth_rate ** depth, activation=activation, norm=norm) for depth in range(n_depth)]
        if reverse_dilation:
            blocks = blocks[::-1]

        self.model = nn.Sequential(*blocks)

    def forward(self, x):
        return self.model(x)
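A quick sketch of how Resnet1D composes the dilated ResConv1DBlock units; the sizes are illustrative:

import torch
from mGPT.archs.tools.resnet import Resnet1D

# 3 residual blocks with dilations 1, 3, 9 (applied in reverse order by default)
net = Resnet1D(n_in=256, n_depth=3, dilation_growth_rate=3, activation='relu', norm=None)
x = torch.randn(2, 256, 64)  # (batch, channels, time)
y = net(x)                   # residual blocks preserve the input shape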
mGPT/archs/tools/token_emb.py
ADDED
@@ -0,0 +1,73 @@
import torch
from torch import Tensor, nn

class NewTokenEmb(nn.Module):
    """
    For adding new tokens to a pretrained model
    """

    def __init__(self,
                 old_embeddings: nn.Embedding,
                 new_num_tokens: int = None) -> None:

        super().__init__()

        self.num_tokens = old_embeddings.num_embeddings + new_num_tokens
        self.old_num_tokens = old_embeddings.num_embeddings
        self.new_num_tokens = new_num_tokens
        self.embedding_dim = old_embeddings.embedding_dim

        # For text embeddings
        self.text_embeddings = nn.Embedding(
            self.num_tokens,
            self.embedding_dim,
            device=old_embeddings.weight.device,
            dtype=old_embeddings.weight.dtype)
        with torch.no_grad():
            self.text_embeddings.weight.data[:old_embeddings.num_embeddings] = old_embeddings.weight.data
            self.text_embeddings.weight.data[self.old_num_tokens:] = torch.zeros(
                self.new_num_tokens,
                self.embedding_dim,
                dtype=old_embeddings.weight.dtype,
                device=old_embeddings.weight.device)
        self.text_embeddings.weight.requires_grad_(False)

        # For motion embeddings
        self.motion_embeddings = nn.Embedding(
            new_num_tokens,
            self.embedding_dim,
            device=old_embeddings.weight.device,
            dtype=old_embeddings.weight.dtype)
        with torch.no_grad():
            self.motion_embeddings.weight.data[:self.old_num_tokens] = torch.zeros(
                new_num_tokens,
                self.embedding_dim,
                dtype=old_embeddings.weight.dtype,
                device=old_embeddings.weight.device)
        self.word2motionProj = nn.Linear(self.old_num_tokens, new_num_tokens)

    def forward(self, input: Tensor) -> Tensor:

        with torch.no_grad():
            self.motion_embeddings.weight.data[:self.old_num_tokens] = torch.zeros(
                self.new_num_tokens,
                self.embedding_dim,
                dtype=self.motion_embeddings.weight.dtype,
                device=self.motion_embeddings.weight.device)

            self.motion_embeddings.weight.data[self.old_num_tokens:] = self.word2motionProj(
                self.text_embeddings.weight.data[:self.old_num_tokens].permute(1, 0)).permute(1, 0)

        return self.text_embeddings(input) + self.motion_embeddings(input)
mGPT/archs/tools/transformer_layers.py
ADDED
@@ -0,0 +1,285 @@
# -*- coding: utf-8 -*-
import math
import torch
import torch.nn as nn
from torch import Tensor

# Took from https://github.com/joeynmt/joeynmt/blob/fb66afcbe1beef9acd59283bcc084c4d4c1e6343/joeynmt/transformer_layers.py


# pylint: disable=arguments-differ
class MultiHeadedAttention(nn.Module):
    """
    Multi-Head Attention module from "Attention is All You Need"

    Implementation modified from OpenNMT-py.
    https://github.com/OpenNMT/OpenNMT-py
    """

    def __init__(self, num_heads: int, size: int, dropout: float = 0.1):
        """
        Create a multi-headed attention layer.
        :param num_heads: the number of heads
        :param size: model size (must be divisible by num_heads)
        :param dropout: probability of dropping a unit
        """
        super().__init__()

        assert size % num_heads == 0

        self.head_size = head_size = size // num_heads
        self.model_size = size
        self.num_heads = num_heads

        self.k_layer = nn.Linear(size, num_heads * head_size)
        self.v_layer = nn.Linear(size, num_heads * head_size)
        self.q_layer = nn.Linear(size, num_heads * head_size)

        self.output_layer = nn.Linear(size, size)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, k: Tensor, v: Tensor, q: Tensor, mask: Tensor = None):
        """
        Computes multi-headed attention.

        :param k: keys   [B, M, D] with M being the sentence length.
        :param v: values [B, M, D]
        :param q: query  [B, M, D]
        :param mask: optional mask [B, 1, M] or [B, M, M]
        :return:
        """
        batch_size = k.size(0)
        num_heads = self.num_heads

        # project the queries (q), keys (k), and values (v)
        k = self.k_layer(k)
        v = self.v_layer(v)
        q = self.q_layer(q)

        # reshape q, k, v for our computation to [batch_size, num_heads, ..]
        k = k.view(batch_size, -1, num_heads, self.head_size).transpose(1, 2)
        v = v.view(batch_size, -1, num_heads, self.head_size).transpose(1, 2)
        q = q.view(batch_size, -1, num_heads, self.head_size).transpose(1, 2)

        # compute scores
        q = q / math.sqrt(self.head_size)

        # batch x num_heads x query_len x key_len
        scores = torch.matmul(q, k.transpose(2, 3))
        # torch.Size([48, 8, 183, 183])

        # apply the mask (if we have one)
        # we add a dimension for the heads to it below: [B, 1, 1, M]
        if mask is not None:
            scores = scores.masked_fill(~mask.unsqueeze(1), float('-inf'))

        # apply attention dropout and compute context vectors.
        attention = self.softmax(scores)
        attention = self.dropout(attention)
        # torch.Size([48, 8, 183, 183]) [bs, nheads, time, time] (for decoding)

        # v: torch.Size([48, 8, 183, 32]) (32 is 256/8)
        # get context vector (select values with attention) and reshape
        # back to [B, M, D]
        context = torch.matmul(attention, v)  # torch.Size([48, 8, 183, 32])
        context = context.transpose(1, 2).contiguous().view(
            batch_size, -1, num_heads * self.head_size)
        # torch.Size([48, 183, 256]) put back to 256 (combine the heads)

        output = self.output_layer(context)
        # torch.Size([48, 183, 256]): 1 output per time step

        return output


# pylint: disable=arguments-differ
class PositionwiseFeedForward(nn.Module):
    """
    Position-wise Feed-forward layer
    Projects to ff_size and then back down to input_size.
    """

    def __init__(self, input_size, ff_size, dropout=0.1):
        """
        Initializes position-wise feed-forward layer.
        :param input_size: dimensionality of the input.
        :param ff_size: dimensionality of intermediate representation
        :param dropout:
        """
        super().__init__()
        self.layer_norm = nn.LayerNorm(input_size, eps=1e-6)
        self.pwff_layer = nn.Sequential(
            nn.Linear(input_size, ff_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_size, input_size),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        x_norm = self.layer_norm(x)
        return self.pwff_layer(x_norm) + x


# pylint: disable=arguments-differ
class PositionalEncoding(nn.Module):
    """
    Pre-compute position encodings (PE).
    In forward pass, this adds the position-encodings to the
    input for as many time steps as necessary.

    Implementation based on OpenNMT-py.
    https://github.com/OpenNMT/OpenNMT-py
    """

    def __init__(self, size: int = 0, max_len: int = 5000):
        """
        Positional Encoding with maximum length max_len
        :param size:
        :param max_len:
        :param dropout:
        """
        if size % 2 != 0:
            raise ValueError("Cannot use sin/cos positional encoding with "
                             "odd dim (got dim={:d})".format(size))
        pe = torch.zeros(max_len, size)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp((torch.arange(0, size, 2, dtype=torch.float) *
                              -(math.log(10000.0) / size)))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)  # shape: [1, size, max_len]
        super().__init__()
        self.register_buffer('pe', pe)
        self.dim = size

    def forward(self, emb):
        """Embed inputs.
        Args:
            emb (FloatTensor): Sequence of word vectors
                ``(seq_len, batch_size, self.dim)``
        """
        # Add position encodings
        return emb + self.pe[:, :emb.size(1)]


class TransformerEncoderLayer(nn.Module):
    """
    One Transformer encoder layer has a Multi-head attention layer plus
    a position-wise feed-forward layer.
    """

    def __init__(self,
                 size: int = 0,
                 ff_size: int = 0,
                 num_heads: int = 0,
                 dropout: float = 0.1):
        """
        A single Transformer layer.
        :param size:
        :param ff_size:
        :param num_heads:
        :param dropout:
        """
        super().__init__()

        self.layer_norm = nn.LayerNorm(size, eps=1e-6)
        self.src_src_att = MultiHeadedAttention(num_heads,
                                                size,
                                                dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(size,
                                                    ff_size=ff_size,
                                                    dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.size = size

    # pylint: disable=arguments-differ
    def forward(self, x: Tensor, mask: Tensor) -> Tensor:
        """
        Forward pass for a single transformer encoder layer.
        First applies layer norm, then self attention,
        then dropout with residual connection (adding the input to the result),
        and then a position-wise feed-forward layer.

        :param x: layer input
        :param mask: input mask
        :return: output tensor
        """
        x_norm = self.layer_norm(x)
        h = self.src_src_att(x_norm, x_norm, x_norm, mask)
        h = self.dropout(h) + x
        o = self.feed_forward(h)
        return o


class TransformerDecoderLayer(nn.Module):
    """
    Transformer decoder layer.

    Consists of self-attention, source-attention, and feed-forward.
    """

    def __init__(self,
                 size: int = 0,
                 ff_size: int = 0,
                 num_heads: int = 0,
                 dropout: float = 0.1):
        """
        Represents a single Transformer decoder layer.

        It attends to the source representation and the previous decoder states.

        :param size: model dimensionality
        :param ff_size: size of the feed-forward intermediate layer
        :param num_heads: number of heads
        :param dropout: dropout to apply to input
        """
        super().__init__()
        self.size = size

        self.trg_trg_att = MultiHeadedAttention(num_heads,
                                                size,
                                                dropout=dropout)
        self.src_trg_att = MultiHeadedAttention(num_heads,
                                                size,
                                                dropout=dropout)

        self.feed_forward = PositionwiseFeedForward(size,
                                                    ff_size=ff_size,
                                                    dropout=dropout)

        self.x_layer_norm = nn.LayerNorm(size, eps=1e-6)
        self.dec_layer_norm = nn.LayerNorm(size, eps=1e-6)

        self.dropout = nn.Dropout(dropout)

    # pylint: disable=arguments-differ
    def forward(self,
                x: Tensor = None,
                memory: Tensor = None,
                src_mask: Tensor = None,
                trg_mask: Tensor = None) -> Tensor:
        """
        Forward pass of a single Transformer decoder layer.

        :param x: inputs
        :param memory: source representations
        :param src_mask: source mask
        :param trg_mask: target mask (so as to not condition on future steps)
        :return: output tensor
        """
        # decoder/target self-attention
        x_norm = self.x_layer_norm(x)  # torch.Size([48, 183, 256])
        h1 = self.trg_trg_att(x_norm, x_norm, x_norm, mask=trg_mask)
        h1 = self.dropout(h1) + x

        # source-target attention
        h1_norm = self.dec_layer_norm(h1)  # torch.Size([48, 183, 256]) (same for memory)
        h2 = self.src_trg_att(memory, memory, h1_norm, mask=src_mask)

        # final position-wise feed-forward layer
        o = self.feed_forward(self.dropout(h2) + h1)

        return o
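Rough usage sketch for the encoder pieces above; the dimensions are illustrative only:

import torch
from mGPT.archs.tools.transformer_layers import PositionalEncoding, TransformerEncoderLayer

layer = TransformerEncoderLayer(size=256, ff_size=1024, num_heads=8, dropout=0.1)
pe = PositionalEncoding(size=256)
x = torch.randn(2, 50, 256)                    # (batch, time, model size)
mask = torch.ones(2, 1, 50, dtype=torch.bool)  # attend to every position
out = layer(pe(x), mask)                       # (2, 50, 256)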
mGPT/callback.py
ADDED
@@ -0,0 +1,200 @@
import os
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import Callback, RichProgressBar, ModelCheckpoint


def build_callbacks(cfg, logger=None, phase='test', **kwargs):
    callbacks = []
    logger = logger

    # Rich Progress Bar
    callbacks.append(progressBar())

    # Checkpoint Callback
    if phase == 'train':
        callbacks.extend(getCheckpointCallback(cfg, logger=logger, **kwargs))

    return callbacks

def getCheckpointCallback(cfg, logger=None, **kwargs):
    callbacks = []
    # Logging
    metric_monitor = {
        "loss_total": "total/train",
        "Train_jf": "recons/text2jfeats/train",
        "Val_jf": "recons/text2jfeats/val",
        "Train_rf": "recons/text2rfeats/train",
        "Val_rf": "recons/text2rfeats/val",
        "APE root": "Metrics/APE_root",
        "APE mean pose": "Metrics/APE_mean_pose",
        "AVE root": "Metrics/AVE_root",
        "AVE mean pose": "Metrics/AVE_mean_pose",
        "R_TOP_1": "Metrics/R_precision_top_1",
        "R_TOP_2": "Metrics/R_precision_top_2",
        "R_TOP_3": "Metrics/R_precision_top_3",
        "gt_R_TOP_3": "Metrics/gt_R_precision_top_3",
        "FID": "Metrics/FID",
        "gt_FID": "Metrics/gt_FID",
        "Diversity": "Metrics/Diversity",
        "MM dist": "Metrics/Matching_score",
        "Accuracy": "Metrics/accuracy",
    }
    callbacks.append(
        progressLogger(logger, metric_monitor=metric_monitor, log_every_n_steps=1))

    # Save 10 latest checkpoints
    checkpointParams = {
        'dirpath': os.path.join(cfg.FOLDER_EXP, "checkpoints"),
        'filename': "{epoch}",
        'monitor': "step",
        'mode': "max",
        'every_n_epochs': cfg.LOGGER.VAL_EVERY_STEPS,
        'save_top_k': 8,
        'save_last': True,
        'save_on_train_epoch_end': True
    }
    callbacks.append(ModelCheckpoint(**checkpointParams))

    # Save checkpoint every n*10 epochs
    checkpointParams.update({
        'every_n_epochs': cfg.LOGGER.VAL_EVERY_STEPS * 10,
        'save_top_k': -1,
        'save_last': False
    })
    callbacks.append(ModelCheckpoint(**checkpointParams))

    metrics = cfg.METRIC.TYPE
    metric_monitor_map = {
        'TemosMetric': {
            'Metrics/APE_root': {
                'abbr': 'APEroot',
                'mode': 'min'
            },
        },
        'TM2TMetrics': {
            'Metrics/FID': {
                'abbr': 'FID',
                'mode': 'min'
            },
            'Metrics/R_precision_top_3': {
                'abbr': 'R3',
                'mode': 'max'
            }
        },
        'MRMetrics': {
            'Metrics/MPJPE': {
                'abbr': 'MPJPE',
                'mode': 'min'
            }
        },
        'HUMANACTMetrics': {
            'Metrics/Accuracy': {
                'abbr': 'Accuracy',
                'mode': 'max'
            }
        },
        'UESTCMetrics': {
            'Metrics/Accuracy': {
                'abbr': 'Accuracy',
                'mode': 'max'
            }
        },
        'UncondMetrics': {
            'Metrics/FID': {
                'abbr': 'FID',
                'mode': 'min'
            }
        }
    }

    checkpointParams.update({
        'every_n_epochs': cfg.LOGGER.VAL_EVERY_STEPS,
        'save_top_k': 1,
    })

    for metric in metrics:
        if metric in metric_monitor_map.keys():
            metric_monitors = metric_monitor_map[metric]

            # Delete R3 if training VAE
            if cfg.TRAIN.STAGE == 'vae' and metric == 'TM2TMetrics':
                del metric_monitors['Metrics/R_precision_top_3']

            for metric_monitor in metric_monitors:
                checkpointParams.update({
                    'filename':
                    metric_monitor_map[metric][metric_monitor]['mode'] + "-" +
                    metric_monitor_map[metric][metric_monitor]['abbr'] + "{ep}",
                    'monitor': metric_monitor,
                    'mode': metric_monitor_map[metric][metric_monitor]['mode'],
                })
                callbacks.append(ModelCheckpoint(**checkpointParams))
    return callbacks

class progressBar(RichProgressBar):
    def __init__(self, ):
        super().__init__()

    def get_metrics(self, trainer, model):
        # Don't show the version number
        items = super().get_metrics(trainer, model)
        items.pop("v_num", None)
        return items

class progressLogger(Callback):
    def __init__(self,
                 logger,
                 metric_monitor: dict,
                 precision: int = 3,
                 log_every_n_steps: int = 1):
        # Metric to monitor
        self.logger = logger
        self.metric_monitor = metric_monitor
        self.precision = precision
        self.log_every_n_steps = log_every_n_steps

    def on_train_start(self, trainer: Trainer, pl_module: LightningModule,
                       **kwargs) -> None:
        self.logger.info("Training started")

    def on_train_end(self, trainer: Trainer, pl_module: LightningModule,
                     **kwargs) -> None:
        self.logger.info("Training done")

    def on_validation_epoch_end(self, trainer: Trainer,
                                pl_module: LightningModule, **kwargs) -> None:
        if trainer.sanity_checking:
            self.logger.info("Sanity checking ok.")

    def on_train_epoch_end(self,
                           trainer: Trainer,
                           pl_module: LightningModule,
                           padding=False,
                           **kwargs) -> None:
        metric_format = f"{{:.{self.precision}e}}"
        line = f"Epoch {trainer.current_epoch}"
        if padding:
            line = f"{line:>{len('Epoch xxxx')}}"  # Right padding

        if trainer.current_epoch % self.log_every_n_steps == 0:
            metrics_str = []

            losses_dict = trainer.callback_metrics
            for metric_name, dico_name in self.metric_monitor.items():
                if dico_name in losses_dict:
                    metric = losses_dict[dico_name].item()
                    metric = metric_format.format(metric)
                    metric = f"{metric_name} {metric}"
                    metrics_str.append(metric)

            line = line + ": " + " ".join(metrics_str)

        self.logger.info(line)
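These callbacks are meant to be handed to a pytorch_lightning Trainer. A rough sketch, assuming cfg is the OmegaConf object produced by mGPT.config.parse_args and logger is any object with an info() method:

import logging
from pytorch_lightning import Trainer
from mGPT.callback import build_callbacks

logger = logging.getLogger(__name__)
callbacks = build_callbacks(cfg, logger=logger, phase='train')  # cfg assumed to come from parse_args
trainer = Trainer(callbacks=callbacks)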
mGPT/config.py
ADDED
@@ -0,0 +1,217 @@
import importlib
from argparse import ArgumentParser
from omegaconf import OmegaConf
from os.path import join as pjoin
import os
import glob


def get_module_config(cfg, filepath="./configs"):
    """
    Load yaml config files from subfolders
    """

    yamls = glob.glob(pjoin(filepath, '*', '*.yaml'))
    yamls = [y.replace(filepath, '') for y in yamls]
    for yaml in yamls:
        nodes = yaml.replace('.yaml', '').replace('/', '.')
        nodes = nodes[1:] if nodes[0] == '.' else nodes
        OmegaConf.update(cfg, nodes, OmegaConf.load('./configs' + yaml))

    return cfg


def get_obj_from_str(string, reload=False):
    """
    Get object from string
    """

    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


def instantiate_from_config(config):
    """
    Instantiate object from config
    """
    if not "target" in config:
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))


def resume_config(cfg: OmegaConf):
    """
    Resume model and wandb
    """

    if cfg.TRAIN.RESUME:
        resume = cfg.TRAIN.RESUME
        if os.path.exists(resume):
            # Checkpoints
            cfg.TRAIN.PRETRAINED = pjoin(resume, "checkpoints", "last.ckpt")
            # Wandb
            wandb_files = os.listdir(pjoin(resume, "wandb", "latest-run"))
            wandb_run = [item for item in wandb_files if "run-" in item][0]
            cfg.LOGGER.WANDB.params.id = wandb_run.replace("run-", "").replace(".wandb", "")
        else:
            raise ValueError("Resume path is not right.")

    return cfg

def parse_args(phase="train"):
    """
    Parse arguments and load config files
    """

    parser = ArgumentParser()
    group = parser.add_argument_group("Training options")

    # Assets
    group.add_argument(
        "--cfg_assets",
        type=str,
        required=False,
        default="./configs/assets.yaml",
        help="config file for asset paths",
    )

    # Default config
    if phase in ["train", "test"]:
        cfg_defualt = "./configs/default.yaml"
    elif phase == "render":
        cfg_defualt = "./configs/render.yaml"
    elif phase == "webui":
        cfg_defualt = "./configs/webui.yaml"

    group.add_argument(
        "--cfg",
        type=str,
        required=False,
        default=cfg_defualt,
        help="config file",
    )

    # Parse for each phase
    if phase in ["train", "test"]:
        group.add_argument("--batch_size",
                           type=int,
                           required=False,
                           help="training batch size")
        group.add_argument("--num_nodes",
                           type=int,
                           required=False,
                           help="number of nodes")
        group.add_argument("--device",
                           type=int,
                           nargs="+",
                           required=False,
                           help="training device")
        group.add_argument("--task",
                           type=str,
                           required=False,
                           help="evaluation task type")
        group.add_argument("--nodebug",
                           action="store_true",
                           required=False,
                           help="debug or not")

    if phase == "demo":
        group.add_argument(
            "--example",
            type=str,
            required=False,
            help="input text and lengths with txt format",
        )
        group.add_argument(
            "--out_dir",
            type=str,
            required=False,
            help="output dir",
        )
        group.add_argument("--task",
                           type=str,
                           required=False,
                           help="evaluation task type")

    if phase == "render":
        group.add_argument("--npy",
                           type=str,
                           required=False,
                           default=None,
                           help="npy motion files")
        group.add_argument("--dir",
                           type=str,
                           required=False,
                           default=None,
                           help="npy motion folder")
        group.add_argument("--fps",
                           type=int,
                           required=False,
                           default=30,
                           help="render fps")
        group.add_argument(
            "--mode",
            type=str,
            required=False,
            default="sequence",
            help="render target: video, sequence, frame",
        )

    params = parser.parse_args()

    # Load yaml config files
    OmegaConf.register_new_resolver("eval", eval)
    cfg_assets = OmegaConf.load(params.cfg_assets)
    cfg_base = OmegaConf.load(pjoin(cfg_assets.CONFIG_FOLDER, 'default.yaml'))
    cfg_exp = OmegaConf.merge(cfg_base, OmegaConf.load(params.cfg))
    if not cfg_exp.FULL_CONFIG:
        cfg_exp = get_module_config(cfg_exp, cfg_assets.CONFIG_FOLDER)
    cfg = OmegaConf.merge(cfg_exp, cfg_assets)

    # Update config with arguments
    if phase in ["train", "test"]:
        cfg.TRAIN.BATCH_SIZE = params.batch_size if params.batch_size else cfg.TRAIN.BATCH_SIZE
        cfg.DEVICE = params.device if params.device else cfg.DEVICE
        cfg.NUM_NODES = params.num_nodes if params.num_nodes else cfg.NUM_NODES
        cfg.model.params.task = params.task if params.task else cfg.model.params.task
        cfg.DEBUG = not params.nodebug if params.nodebug is not None else cfg.DEBUG

    # Force no debug in test
    if phase == "test":
        cfg.DEBUG = False
        cfg.DEVICE = [0]
        print("Force no debugging and one gpu when testing")

    if phase == "demo":
        cfg.DEMO.RENDER = params.render
        cfg.DEMO.FRAME_RATE = params.frame_rate
        cfg.DEMO.EXAMPLE = params.example
        cfg.DEMO.TASK = params.task
        cfg.TEST.FOLDER = params.out_dir if params.out_dir else cfg.TEST.FOLDER
        os.makedirs(cfg.TEST.FOLDER, exist_ok=True)

    if phase == "render":
        if params.npy:
            cfg.RENDER.NPY = params.npy
            cfg.RENDER.INPUT_MODE = "npy"
        if params.dir:
            cfg.RENDER.DIR = params.dir
            cfg.RENDER.INPUT_MODE = "dir"
        if params.fps:
            cfg.RENDER.FPS = float(params.fps)
        cfg.RENDER.MODE = params.mode

    # Debug mode
    if cfg.DEBUG:
        cfg.NAME = "debug--" + cfg.NAME
        cfg.LOGGER.WANDB.params.offline = True
        cfg.LOGGER.VAL_EVERY_STEPS = 1

    # Resume config
    cfg = resume_config(cfg)

    return cfg
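Typical entry-point usage, a sketch; the train.py script name in the comment is only an example of how parse_args would usually be invoked, not a file guaranteed to exist in this repo:

# e.g.  python train.py --cfg ./configs/default.yaml --batch_size 32 --nodebug
from mGPT.config import parse_args, instantiate_from_config

cfg = parse_args(phase="train")             # merges assets.yaml, default.yaml and --cfg
model = instantiate_from_config(cfg.model)  # builds the class named by cfg.model.target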
mGPT/data/HumanML3D.py
ADDED
@@ -0,0 +1,117 @@
import numpy as np
import torch
from os.path import join as pjoin
from .humanml.utils.word_vectorizer import WordVectorizer
from .humanml.scripts.motion_process import (process_file, recover_from_ric)
from . import BASEDataModule
from .humanml import Text2MotionDatasetEval, Text2MotionDataset, Text2MotionDatasetCB, MotionDataset, MotionDatasetVQ, Text2MotionDatasetToken, Text2MotionDatasetM2T
from .utils import humanml3d_collate


class HumanML3DDataModule(BASEDataModule):
    def __init__(self, cfg, **kwargs):

        super().__init__(collate_fn=humanml3d_collate)
        self.cfg = cfg
        self.save_hyperparameters(logger=False)

        # Basic info of the dataset
        cfg.DATASET.JOINT_TYPE = 'humanml3d'
        self.name = "humanml3d"
        self.njoints = 22

        # Path to the dataset
        data_root = cfg.DATASET.HUMANML3D.ROOT
        self.hparams.data_root = data_root
        self.hparams.text_dir = pjoin(data_root, "texts")
        self.hparams.motion_dir = pjoin(data_root, 'new_joint_vecs')

        # Mean and std of the dataset
        self.hparams.mean = np.load(pjoin('assets/meta', "mean.npy"))
        self.hparams.std = np.load(pjoin('assets/meta', "std.npy"))

        # Mean and std for fair evaluation
        self.hparams.mean_eval = np.load(pjoin('assets/meta', "mean_eval.npy"))
        self.hparams.std_eval = np.load(pjoin('assets/meta', "std_eval.npy"))

        # Length of the dataset
        self.hparams.max_motion_length = cfg.DATASET.HUMANML3D.MAX_MOTION_LEN
        self.hparams.min_motion_length = cfg.DATASET.HUMANML3D.MIN_MOTION_LEN
        self.hparams.max_text_len = cfg.DATASET.HUMANML3D.MAX_TEXT_LEN
        self.hparams.unit_length = cfg.DATASET.HUMANML3D.UNIT_LEN

        # Additional parameters
        self.hparams.debug = cfg.DEBUG
        self.hparams.stage = cfg.TRAIN.STAGE

        # Dataset switch
        self.DatasetEval = Text2MotionDatasetEval

        if cfg.TRAIN.STAGE == "vae":
            if cfg.model.params.motion_vae.target.split('.')[-1].lower() == "vqvae":
                self.hparams.win_size = 64
                self.Dataset = MotionDatasetVQ
            else:
                self.Dataset = MotionDataset
        elif 'lm' in cfg.TRAIN.STAGE:
            self.hparams.code_path = cfg.DATASET.CODE_PATH
            self.hparams.task_path = cfg.DATASET.TASK_PATH
            self.hparams.std_text = cfg.DATASET.HUMANML3D.STD_TEXT
            self.Dataset = Text2MotionDatasetCB
        elif cfg.TRAIN.STAGE == "token":
            self.Dataset = Text2MotionDatasetToken
            self.DatasetEval = Text2MotionDatasetToken
        elif cfg.TRAIN.STAGE == "m2t":
            self.Dataset = Text2MotionDatasetM2T
            self.DatasetEval = Text2MotionDatasetM2T
        else:
            self.Dataset = Text2MotionDataset

        # Get additional info of the dataset
        self.nfeats = 263
        cfg.DATASET.NFEATS = self.nfeats

    def feats2joints(self, features):
        mean = torch.tensor(self.hparams.mean).to(features)
        std = torch.tensor(self.hparams.std).to(features)
        features = features * std + mean
        return recover_from_ric(features, self.njoints)

    def joints2feats(self, features):
        features = process_file(features, self.njoints)[0]
        return features

    def normalize(self, features):
        mean = torch.tensor(self.hparams.mean).to(features)
        std = torch.tensor(self.hparams.std).to(features)
        features = (features - mean) / std
        return features

    def denormalize(self, features):
        mean = torch.tensor(self.hparams.mean).to(features)
        std = torch.tensor(self.hparams.std).to(features)
        features = features * std + mean
        return features

    def renorm4t2m(self, features):
        # renorm to t2m norms for using t2m evaluators
        ori_mean = torch.tensor(self.hparams.mean).to(features)
        ori_std = torch.tensor(self.hparams.std).to(features)
        eval_mean = torch.tensor(self.hparams.mean_eval).to(features)
        eval_std = torch.tensor(self.hparams.std_eval).to(features)
        features = features * ori_std + ori_mean
        features = (features - eval_mean) / eval_std
        return features

    def mm_mode(self, mm_on=True):
        if mm_on:
            self.is_mm = True
            self.name_list = self.test_dataset.name_list
            self.mm_list = np.random.choice(self.name_list,
                                            self.cfg.METRIC.MM_NUM_SAMPLES,
                                            replace=False)
            self.test_dataset.name_list = self.mm_list
        else:
            self.is_mm = False
            self.test_dataset.name_list = self.name_list
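A small sanity-check sketch for the normalisation helpers above, assuming datamodule is an already constructed HumanML3DDataModule (shapes are illustrative):

import torch
feats = torch.randn(1, 196, 263)                  # (batch, frames, nfeats), already normalised
joints = datamodule.feats2joints(feats)           # de-normalise and recover 22 joints in XYZ
roundtrip = datamodule.normalize(datamodule.denormalize(feats))
assert torch.allclose(roundtrip, feats, atol=1e-5)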
mGPT/data/Kit.py
ADDED
@@ -0,0 +1,88 @@
import numpy as np
import torch
from os.path import join as pjoin
from .humanml.utils.word_vectorizer import WordVectorizer
from .humanml.scripts.motion_process import (process_file, recover_from_ric)
from .HumanML3D import HumanML3DDataModule
from .humanml import Text2MotionDatasetEval, Text2MotionDataset, Text2MotionDatasetCB, MotionDataset, MotionDatasetVQ, Text2MotionDatasetToken


class KitDataModule(HumanML3DDataModule):
    def __init__(self, cfg, **kwargs):

        super().__init__(cfg, **kwargs)

        # Basic info of the dataset
        self.name = "kit"
        self.njoints = 21

        # Path to the dataset
        data_root = cfg.DATASET.KIT.ROOT
        self.hparams.data_root = data_root
        self.hparams.text_dir = pjoin(data_root, "texts")
        self.hparams.motion_dir = pjoin(data_root, 'new_joint_vecs')

        # Mean and std of the dataset
        dis_data_root = pjoin(cfg.DATASET.KIT.MEAN_STD_PATH, 'kit',
                              "VQVAEV3_CB1024_CMT_H1024_NRES3", "meta")
        self.hparams.mean = np.load(pjoin(dis_data_root, "mean.npy"))
        self.hparams.std = np.load(pjoin(dis_data_root, "std.npy"))

        # Mean and std for fair evaluation
        dis_data_root_eval = pjoin(cfg.DATASET.KIT.MEAN_STD_PATH, 't2m',
                                   "Comp_v6_KLD005", "meta")
        self.hparams.mean_eval = np.load(pjoin(dis_data_root_eval, "mean.npy"))
        self.hparams.std_eval = np.load(pjoin(dis_data_root_eval, "std.npy"))

        # Length of the dataset
        self.hparams.max_motion_length = cfg.DATASET.KIT.MAX_MOTION_LEN
        self.hparams.min_motion_length = cfg.DATASET.KIT.MIN_MOTION_LEN
        self.hparams.max_text_len = cfg.DATASET.KIT.MAX_TEXT_LEN
        self.hparams.unit_length = cfg.DATASET.KIT.UNIT_LEN

        # Get additional info of the dataset
        self._sample_set = self.get_sample_set(overrides={"split": "test", "tiny": True})
        self.nfeats = self._sample_set.nfeats
        cfg.DATASET.NFEATS = self.nfeats

    def feats2joints(self, features):
        mean = torch.tensor(self.hparams.mean).to(features)
        std = torch.tensor(self.hparams.std).to(features)
        features = features * std + mean
        return recover_from_ric(features, self.njoints)

    def joints2feats(self, features):
        features = process_file(features, self.njoints)[0]
        # mean = torch.tensor(self.hparams.mean).to(features)
        # std = torch.tensor(self.hparams.std).to(features)
        # features = (features - mean) / std
        return features

    def normalize(self, features):
        mean = torch.tensor(self.hparams.mean).to(features)
        std = torch.tensor(self.hparams.std).to(features)
        features = (features - mean) / std
        return features

    def renorm4t2m(self, features):
        # renorm to t2m norms for using t2m evaluators
        ori_mean = torch.tensor(self.hparams.mean).to(features)
        ori_std = torch.tensor(self.hparams.std).to(features)
        eval_mean = torch.tensor(self.hparams.mean_eval).to(features)
        eval_std = torch.tensor(self.hparams.std_eval).to(features)
        features = features * ori_std + ori_mean
        features = (features - eval_mean) / eval_std
        return features

    def mm_mode(self, mm_on=True):
        # random select samples for mm
        if mm_on:
            self.is_mm = True
            self.name_list = self.test_dataset.name_list
            self.mm_list = np.random.choice(self.name_list,
                                            self.cfg.METRIC.MM_NUM_SAMPLES,
                                            replace=False)
            self.test_dataset.name_list = self.mm_list
        else:
            self.is_mm = False
            self.test_dataset.name_list = self.name_list
mGPT/data/__init__.py
ADDED
@@ -0,0 +1,103 @@
import pytorch_lightning as pl
from torch.utils.data import DataLoader


class BASEDataModule(pl.LightningDataModule):
    def __init__(self, collate_fn):
        super().__init__()

        self.dataloader_options = {"collate_fn": collate_fn}
        self.persistent_workers = True
        self.is_mm = False

        self._train_dataset = None
        self._val_dataset = None
        self._test_dataset = None

    def get_sample_set(self, overrides={}):
        sample_params = self.hparams.copy()
        sample_params.update(overrides)
        return self.DatasetEval(**sample_params)

    @property
    def train_dataset(self):
        if self._train_dataset is None:
            self._train_dataset = self.Dataset(split=self.cfg.TRAIN.SPLIT,
                                               **self.hparams)
        return self._train_dataset

    @property
    def val_dataset(self):
        if self._val_dataset is None:
            params = self.hparams.copy()
            params['code_path'] = None
            params['split'] = self.cfg.EVAL.SPLIT
            self._val_dataset = self.DatasetEval(**params)
        return self._val_dataset

    @property
    def test_dataset(self):
        if self._test_dataset is None:
            # self._test_dataset = self.DatasetEval(split=self.cfg.TEST.SPLIT,
            #                                       **self.hparams)
            params = self.hparams.copy()
            params['code_path'] = None
            params['split'] = self.cfg.TEST.SPLIT
            self._test_dataset = self.DatasetEval(**params)
        return self._test_dataset

    def setup(self, stage=None):
        # Use the getter the first time to load the data
        if stage in (None, "fit"):
            _ = self.train_dataset
            _ = self.val_dataset
        if stage in (None, "test"):
            _ = self.test_dataset

    def train_dataloader(self):
        dataloader_options = self.dataloader_options.copy()
        dataloader_options["batch_size"] = self.cfg.TRAIN.BATCH_SIZE
        dataloader_options["num_workers"] = self.cfg.TRAIN.NUM_WORKERS
        return DataLoader(
            self.train_dataset,
            shuffle=False,
            persistent_workers=True,
            **dataloader_options,
        )

    def predict_dataloader(self):
        dataloader_options = self.dataloader_options.copy()
        dataloader_options["batch_size"] = 1 if self.is_mm else self.cfg.TEST.BATCH_SIZE
        dataloader_options["num_workers"] = self.cfg.TEST.NUM_WORKERS
        dataloader_options["shuffle"] = False
        return DataLoader(
            self.test_dataset,
            persistent_workers=True,
            **dataloader_options,
        )

    def val_dataloader(self):
        # overrides batch_size and num_workers
        dataloader_options = self.dataloader_options.copy()
        dataloader_options["batch_size"] = self.cfg.EVAL.BATCH_SIZE
        dataloader_options["num_workers"] = self.cfg.EVAL.NUM_WORKERS
        dataloader_options["shuffle"] = False
        return DataLoader(
            self.val_dataset,
            persistent_workers=True,
            **dataloader_options,
        )

    def test_dataloader(self):
        # overrides batch_size and num_workers
        dataloader_options = self.dataloader_options.copy()
        dataloader_options["batch_size"] = 1 if self.is_mm else self.cfg.TEST.BATCH_SIZE
        dataloader_options["num_workers"] = self.cfg.TEST.NUM_WORKERS
        dataloader_options["shuffle"] = False
        return DataLoader(
            self.test_dataset,
            persistent_workers=True,
            **dataloader_options,
        )
mGPT/data/build_data.py
ADDED
@@ -0,0 +1,15 @@
from omegaconf import OmegaConf
from os.path import join as pjoin
from mGPT.config import instantiate_from_config


def build_data(cfg, phase="train"):
    data_config = OmegaConf.to_container(cfg.DATASET, resolve=True)
    data_config['params'] = {'cfg': cfg, 'phase': phase}
    if isinstance(data_config['target'], str):
        return instantiate_from_config(data_config)
    elif isinstance(data_config['target'], list):
        # Multiple datasets: wrap them in the concatenating data module.
        data_config_tmp = data_config.copy()
        data_config_tmp['params']['dataModules'] = data_config['target']
        data_config_tmp['target'] = 'mGPT.data.Concat.ConcatDataModule'
        # Instantiate the modified copy (the original passed the unmodified dict here).
        return instantiate_from_config(data_config_tmp)
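Typical wiring, a sketch assuming cfg comes from mGPT.config.parse_args:

from mGPT.config import parse_args
from mGPT.data.build_data import build_data

cfg = parse_args(phase="train")
datamodule = build_data(cfg, phase="train")  # instantiates cfg.DATASET.target, e.g. the HumanML3D module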
|